/* $Id: HopCount.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.jobs;
import java.util.*;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
/** This class manages the table that keeps track of hop count, and algorithmically determines this value
* for a document identifier upon request.
*
* <br><br>
* <b>hopcount</b>
* <table border="1" cellpadding="3" cellspacing="0">
* <tr class="TableHeadingColor">
* <th>Field</th><th>Type</th><th>Description</th></tr>
* <tr><td>id</td><td>BIGINT</td><td>Primary Key</td></tr>
* <tr><td>jobid</td><td>BIGINT</td><td>Reference:jobs.id</td></tr>
* <tr><td>linktype</td><td>VARCHAR(255)</td><td></td></tr>
* <tr><td>parentidhash</td><td>VARCHAR(40)</td><td></td></tr>
* <tr><td>distance</td><td>BIGINT</td><td></td></tr>
* <tr><td>deathmark</td><td>CHAR(1)</td><td></td></tr>
* </table>
* <br><br>
*
*/
public class HopCount extends org.apache.manifoldcf.core.database.BaseTable
{
public static final String _rcsid = "@(#)$Id: HopCount.java 988245 2010-08-23 18:39:35Z kwright $";
// Answer constants
public static final int ANSWER_UNKNOWN = -1;
public static final int ANSWER_INFINITY = -2;
// Notes on the schema
// ===================
//
// This schema consists of three interrelated tables. The table controlled directly by this class
// is the table where cached distance results are kept. It has a child table, which keeps track
// of certain dependencies, so that we have a way of figuring out relatively accurately which cached links
// need to be re-evaluated when there is a change. Finally, there is a related table where intrinsic
// (i.e. direct) link information is kept.
//
// When links are recorded, a source document refers to target documents. The convention here is
// that the source document is called the "child", and the target document is called the "parent".
// Also by convention, a child value of null means "the root". Since all cached distances are to
// the root, we only store the "parent" in the hopcount table.
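//
// For example (illustrative only), if document A contains a link to document B, then A is the
// "child" and B is the "parent" of that link; a seed document S referenced directly from the root
// is recorded with a null child and parent S.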
//
// Each row in the main hopcount table is linked with the child tables by means of an id field.
//
// Database table management for hopcount determination
// ====================================================
//
// The critical operation we want to be able to do is to propagate the effects of a change throughout
// the cached data. I originally assumed that that meant "blowing the cache" - deleting all minimum
// hop counts stored in the database which corresponded to the link we have added or deleted.
// However, after the naive algorithm ran for a while, it became clear that it was not going to perform
// well, because the sheer quantity of dependency information made management of dependencies far
// exceed reason. Caching of hopcount, however, still was clearly essential, because when I removed
// the caching completely, things just plain wedged.
//
// Then I realized that by far the most common activity involves adding links to the graph, and therefore
// if I could optimize that activity without storing huge quantities of dependency information, the
// performance goals would be met. So, this is how the thinking went:
//
// - We always start with a graph where the cached hopcount values only exist IF the hopcount values
// that were needed to come up with that value also exist. Any changes to the graph MUST preserve this
// situation.
// - Under these conditions, adding a link between a source and target could encounter either of two conditions:
// (a) the target has no cached hopcount, or
// (b) the target DOES have a cached hopcount.
// In case (a), the absence of a record means "infinite distance", which is clearly wrong once
// the new link exists. We therefore must create a record for that location with a value of
// infinity. After that, we treat it the exact same way as case (b).
// In the case of (b), we need to re-evaluate the hopcount with the new link in place,
// and compare it against the existing hopcount. The new value cannot be larger (unless the table was somehow corrupted),
// because adding a link can NEVER increase a hopcount. If the new hopcount is less than the old, then
// we change the value in the table, and examine all the target nodes in the same way. Most likely, the
// propagation will stop quickly, because there are lots of ways of getting to a node and this is just one
// of them.
// - When a link is deleted, we run the risk of leaving around disconnected loops that evaluate forever, if
// we use the same propagation algorithm. So instead, we want to keep track of what nodes will need reevaluation
// when a link is destroyed. This list is relatively small, since only the shortest possible path to a node
// is represented in this dependency information.
// So, when a link is deleted, the following steps take place. All the dependent hopcount nodes are queued, but
// in such a way as to be reset to having an "infinite" distance. Then, re-evaluation occurs in the same manner as for
// the add case above.
// - In order to determine the hopcount value of a node at any given time, all you need to do is to look for a cached
// hopcount value. If you find it, that's the right number. If you don't, you can presume the value is infinity.
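//
// A tiny worked example of the "add" case (illustrative only):
//
// Suppose the cached state is root -> A with distance 1, and B has no hopcount record yet.
// Adding the link A -> B first creates a record for B with distance "infinity" (case (a) above),
// then re-evaluates it through the new link: B's distance becomes 1 + 1 = 2. If a link C -> B is
// later added where C has cached distance 3, the re-evaluated value (3 + 1 = 4) is not less than
// the cached 2, so nothing changes and propagation stops immediately.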
//
//
// Activities that should occur when a hopcount changes
// ====================================================
//
// Documents in the job queue may be excluded from consideration based on hopcount. If the hopcount for a document changes
// (decreases), this assessment could well change. Therefore, this hopcount module MUST cause documents to be switched
// to a "pending" state whenever a hopcount change occurs that makes the document pass its hopcount filtering criteria.
//
//
// Field names
public static final String idField = "id";
public static final String jobIDField = "jobid";
public static final String linkTypeField = "linktype";
public static final String parentIDHashField = "parentidhash";
public static final String distanceField = "distance";
public static final String markForDeathField = "deathmark";
// Mark for death status
public static final int MARK_NORMAL = 0;
public static final int MARK_QUEUED = 1;
public static final int MARK_DELETING = 2;
protected static Map<String,Integer> markMap;
static
{
markMap = new HashMap<String,Integer>();
markMap.put("N",MARK_NORMAL);
markMap.put("Q",MARK_QUEUED);
markMap.put("D",MARK_DELETING);
}
/** Intrinsic link table manager. */
protected IntrinsicLink intrinsicLinkManager;
/** Hop "delete" dependencies manager */
protected HopDeleteDeps deleteDepsManager;
/** Thread context */
protected IThreadContext threadContext;
/** Constructor.
*@param tc is the thread context.
*@param database is the database handle.
*/
public HopCount(IThreadContext tc, IDBInterface database)
throws ManifoldCFException
{
super(database,"hopcount");
this.threadContext = tc;
intrinsicLinkManager = new IntrinsicLink(database);
deleteDepsManager = new HopDeleteDeps(database);
}
/** Install or upgrade.
*/
public void install(String jobsTable, String jobsColumn)
throws ManifoldCFException
{
// Per convention, always have outer loop in install() methods
while (true)
{
Map existing = getTableSchema(null,null);
if (existing == null)
{
HashMap map = new HashMap();
map.put(idField,new ColumnDescription("BIGINT",true,false,null,null,false));
map.put(jobIDField,new ColumnDescription("BIGINT",false,false,jobsTable,jobsColumn,false));
map.put(linkTypeField,new ColumnDescription("VARCHAR(255)",false,true,null,null,false));
map.put(parentIDHashField,new ColumnDescription("VARCHAR(40)",false,false,null,null,false));
map.put(distanceField,new ColumnDescription("BIGINT",false,true,null,null,false));
map.put(markForDeathField,new ColumnDescription("CHAR(1)",false,false,null,null,false));
performCreate(map,null);
}
else
{
// Upgrade goes here, if needed
}
// Do child tables.
intrinsicLinkManager.install(jobsTable,jobsColumn);
deleteDepsManager.install(jobsTable,jobsColumn,getTableName(),idField);
// Do indexes
IndexDescription jobLinktypeParentIndex = new IndexDescription(true,new String[]{jobIDField,parentIDHashField,linkTypeField});
IndexDescription jobDeathIndex = new IndexDescription(false,new String[]{jobIDField,markForDeathField,parentIDHashField,linkTypeField});
Map indexes = getTableIndexes(null,null);
Iterator iter = indexes.keySet().iterator();
while (iter.hasNext())
{
String indexName = (String)iter.next();
IndexDescription id = (IndexDescription)indexes.get(indexName);
if (jobLinktypeParentIndex != null && id.equals(jobLinktypeParentIndex))
jobLinktypeParentIndex = null;
else if (jobDeathIndex != null && id.equals(jobDeathIndex))
jobDeathIndex = null;
else if (indexName.indexOf("_pkey") == -1)
// This index shouldn't be here; drop it
performRemoveIndex(indexName);
}
if (jobLinktypeParentIndex != null)
performAddIndex(null,jobLinktypeParentIndex);
if (jobDeathIndex != null)
performAddIndex(null,jobDeathIndex);
break;
}
}
/** Uninstall.
*/
public void deinstall()
throws ManifoldCFException
{
beginTransaction();
try
{
deleteDepsManager.deinstall();
intrinsicLinkManager.deinstall();
performDrop(null);
}
catch (ManifoldCFException e)
{
signalRollback();
throw e;
}
catch (Error e)
{
signalRollback();
throw e;
}
finally
{
endTransaction();
}
}
/** Go from string to mark.
*@param value is the string.
*@return the status value.
*/
public static int stringToMark(String value)
throws ManifoldCFException
{
Integer x = markMap.get(value);
if (x == null)
throw new ManifoldCFException("Bad mark value: '"+value+"'");
return x.intValue();
}
/** Go from mark to string.
*@param mark is the mark.
*@return the string.
*/
public static String markToString(int mark)
throws ManifoldCFException
{
switch (mark)
{
case MARK_NORMAL:
return "N";
case MARK_QUEUED:
return "Q";
case MARK_DELETING:
return "D";
default:
throw new ManifoldCFException("Bad mark value");
}
}
/** Delete an owner (and clean up the corresponding hopcount rows).
*/
public void deleteOwner(Long jobID)
throws ManifoldCFException
{
// Delete the intrinsic rows belonging to this job.
intrinsicLinkManager.deleteOwner(jobID);
// Delete the deletedeps rows
deleteDepsManager.deleteJob(jobID);
// Delete our own rows.
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID)});
performDelete("WHERE "+query,list,null);
noteModifications(0,0,1);
}
/** Reset, at startup time.
*@param processID is the process ID.
*/
public void restart(String processID)
throws ManifoldCFException
{
intrinsicLinkManager.restart(processID);
}
/** Clean up after all process IDs.
*/
public void restart()
throws ManifoldCFException
{
intrinsicLinkManager.restart();
}
/** Restart entire cluster.
*/
public void restartCluster()
throws ManifoldCFException
{
intrinsicLinkManager.restartCluster();
}
/** Record references from a set of documents to the root. These will be marked as "new" or "existing", and
* will have a null linktype.
*/
public void recordSeedReferences(Long jobID, String[] legalLinkTypes, String[] targetDocumentIDHashes, int hopcountMethod, String processID)
throws ManifoldCFException
{
doRecord(jobID,legalLinkTypes,"",targetDocumentIDHashes,"",hopcountMethod,processID);
}
/** Finish seed references. Seed references are special in that the only source is the root.
*/
public void finishSeedReferences(Long jobID, String[] legalLinkTypes, int hopcountMethod)
throws ManifoldCFException
{
doFinish(jobID,legalLinkTypes,new String[]{""},hopcountMethod);
}
/** Record a reference from source to target. This reference will be marked as "new" or "existing".
*/
public boolean recordReference(Long jobID, String[] legalLinkTypes, String sourceDocumentIDHash, String targetDocumentIDHash, String linkType,
int hopcountMethod, String processID)
throws ManifoldCFException
{
return doRecord(jobID,legalLinkTypes,sourceDocumentIDHash,new String[]{targetDocumentIDHash},linkType,hopcountMethod,processID)[0];
}
/** Record a set of references from source to target. These references will be marked as "new" or "existing".
*/
public boolean[] recordReferences(Long jobID, String[] legalLinkTypes, String sourceDocumentIDHash, String[] targetDocumentIDHashes, String linkType,
int hopcountMethod, String processID)
throws ManifoldCFException
{
return doRecord(jobID,legalLinkTypes,sourceDocumentIDHash,targetDocumentIDHashes,linkType,hopcountMethod,processID);
}
/** Complete a recalculation pass for a set of source documents. All child links that are not marked as "new"
* or "existing" will be removed. At the completion of this pass, the links will have their "new" flag cleared.
*/
public void finishParents(Long jobID, String[] legalLinkTypes, String[] sourceDocumentHashes, int hopcountMethod)
throws ManifoldCFException
{
doFinish(jobID,legalLinkTypes,sourceDocumentHashes,hopcountMethod);
}
/** Revert newly-added links, because of a possibly incomplete document processing phase.
* All child links marked as "new" will be removed, and all links marked as "existing" will be
* reset to be "base".
*/
public void revertParents(Long jobID, String[] sourceDocumentHashes)
throws ManifoldCFException
{
intrinsicLinkManager.revertLinks(jobID,sourceDocumentHashes);
}
/** Do the work of recording source-target references. */
protected boolean[] doRecord(Long jobID, String[] legalLinkTypes, String sourceDocumentIDHash, String[] targetDocumentIDHashes, String linkType,
int hopcountMethod, String processID)
throws ManifoldCFException
{
// NOTE: In order for the revertParents() call above to be correct in its current form,
// this method would need to be revised to not process any additions until the finishParents() call
// is made. At the moment, revertParents() is not used by any thread.
// TBD, MHL
boolean[] rval = new boolean[targetDocumentIDHashes.length];
for (int i = 0; i < rval.length; i++)
{
rval[i] = false;
}
String[] newReferences = intrinsicLinkManager.recordReferences(jobID,sourceDocumentIDHash,targetDocumentIDHashes,linkType,processID);
if (newReferences.length > 0)
{
// There are added links.
// The add causes hopcount records to be queued for processing (and created if they don't exist).
// ALL the hopcount records for the target document ids must be queued, for all the link types
// there are for this job. Other times, the queuing requirement is less stringent, such as
// when a hopcount for one linktype changes. In those cases we only want to queue up hopcount
// records corresponding to the changed record.
// What we need to do is create a queue which contains only the target hopcount table rows, if they
// exist. Then we run the update algorithm until the queue is empty.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Queueing "+Integer.toString(targetDocumentIDHashes.length)+" documents");
// Since we really want efficiency, we can write the answer in place now, based on the current
// hopcount rows. This works even if the current row is out of date, because if we change the
// current row's value, the target rows will be requeued at that point.
// When we record new links, we must come up with an initial calculation or requeue ALL legal link
// types. If this isn't done, then we cannot guarantee that the target record will exist - and
// somebody will then interpret the distance as being 'infinity'.
//
// It would be possible to change this but we would then also need to change how a missing record
// would be interpreted.
//if (!(linkType == null || linkType.length() == 0))
// legalLinkTypes = new String[]{linkType};
// So, let's load what we have for hopcount and dependencies for sourceDocumentID.
Answer[] estimates = new Answer[legalLinkTypes.length];
if (sourceDocumentIDHash == null || sourceDocumentIDHash.length() == 0)
{
for (int i = 0; i < estimates.length; i++)
{
estimates[i] = new Answer(0);
}
}
else
{
StringBuilder sb = new StringBuilder("SELECT ");
ArrayList list = new ArrayList();
sb.append(idField).append(",")
.append(distanceField).append(",")
.append(linkTypeField)
.append(" FROM ").append(getTableName()).append(" WHERE ");
sb.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(parentIDHashField,sourceDocumentIDHash),
new MultiClause(linkTypeField,legalLinkTypes)}));
IResultSet set = performQuery(sb.toString(),list,null,null);
Map<String,Answer> answerMap = new HashMap<String,Answer>();
for (int i = 0; i < estimates.length; i++)
{
estimates[i] = new Answer(ANSWER_INFINITY);
answerMap.put(legalLinkTypes[i],estimates[i]);
}
for (int i = 0; i < set.getRowCount(); i++)
{
IResultRow row = set.getRow(i);
Long id = (Long)row.getValue(idField);
DeleteDependency[] dds;
if (hopcountMethod != IJobDescription.HOPCOUNT_NEVERDELETE)
dds = deleteDepsManager.getDeleteDependencies(id);
else
dds = new DeleteDependency[0];
Long distance = (Long)row.getValue(distanceField);
String recordedLinkType = (String)row.getValue(linkTypeField);
Answer a = answerMap.get(recordedLinkType);
int recordedDistance = (int)distance.longValue();
if (recordedDistance != -1)
{
a.setAnswer(recordedDistance,dds);
}
}
}
// Now add these documents to the processing queue
boolean[] hasChanged = addToProcessingQueue(jobID,legalLinkTypes,newReferences,estimates,sourceDocumentIDHash,linkType,hopcountMethod);
// Note the changes in the return value
Map<String,Boolean> changeMap = new HashMap<String,Boolean>();
for (int i = 0; i < newReferences.length; i++)
{
changeMap.put(newReferences[i],new Boolean(hasChanged[i]));
}
for (int i = 0; i < rval.length; i++)
{
Boolean x = changeMap.get(targetDocumentIDHashes[i]);
if (x != null && x.booleanValue())
rval[i] = true;
}
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done queueing "+Integer.toString(targetDocumentIDHashes.length)+" documents");
}
return rval;
}
/** Remove a set of document identifiers specified as a criteria. This will remove hopcount rows and
* also intrinsic links that have the specified document identifiers as sources.
*/
public void deleteMatchingDocuments(Long jobID, String[] legalLinkTypes,
String joinTableName,
String joinTableIDColumn, String joinTableJobColumn,
String joinTableCriteria, ArrayList joinTableParams,
int hopcountMethod)
throws ManifoldCFException
{
// This should work similarly to deleteDocumentIdentifiers() except that the identifiers
// come from a subquery rather than a list.
// This also removes the links themselves...
if (hopcountMethod == IJobDescription.HOPCOUNT_ACCURATE)
{
doDeleteDocuments(jobID,joinTableName,
joinTableIDColumn,joinTableJobColumn,
joinTableCriteria,joinTableParams);
}
}
/** Remove a set of document identifier hashes. This will also remove the intrinsic links that have these document
* identifier hashes as sources, as well as invalidating cached hop counts that depend on them.
*/
public void deleteDocumentIdentifiers(Long jobID, String[] legalLinkTypes, String[] documentHashes, int hopcountMethod)
throws ManifoldCFException
{
// What I want to do here is to first perform the invalidation of the cached hopcounts.
//
// UPDATE hopcount SET markfordeath='X' WHERE EXISTS(SELECT 'x' FROM hopdeletedeps t0 WHERE t0.ownerid=hopcount.id AND t0.jobid=<jobid>
// AND EXISTS(SELECT 'x' FROM intrinsiclinks t1 WHERE t1.linktype=t0.linktype AND t1.parentid=t0.parentid
// AND t1.childid=t0.childid AND t1.jobid=<jobid> AND t1.childid IN(<sourcedocs>)))
//
// ... and then, re-evaluate all hopcount records and their dependencies that are marked for delete.
//
// This also removes the links themselves...
if (hopcountMethod == IJobDescription.HOPCOUNT_ACCURATE)
doDeleteDocuments(jobID,documentHashes);
}
/** Calculate a bunch of hop-counts. The values returned are only guaranteed to be an upper bound, unless
* the queue has recently been processed (via processQueue below). -1 will be returned to indicate "infinity".
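* <br><br>Hypothetical usage sketch (variable names are illustrative only):
* <pre>
* int[] distances = hopCount.findHopCounts(jobID, new String[]{docHashA,docHashB}, "link");
* if (distances[0] == -1)
* {
*   // no known path to the root; treat as infinity
* }
* </pre>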
*/
public int[] findHopCounts(Long jobID, String[] parentIdentifierHashes, String linkType)
throws ManifoldCFException
{
// No transaction, since we can happily interpret whatever comes back.
ArrayList list = new ArrayList();
int[] rval = new int[parentIdentifierHashes.length];
HashMap rvalMap = new HashMap();
int i = 0;
while (i < rval.length)
{
rval[i] = -1;
rvalMap.put(parentIdentifierHashes[i],new Integer(i));
i++;
}
int maxClause = maxClauseProcessFind(jobID,linkType);
i = 0;
int k = 0;
while (i < parentIdentifierHashes.length)
{
if (k == maxClause)
{
processFind(rval,rvalMap,jobID,linkType,list);
k = 0;
list.clear();
}
list.add(parentIdentifierHashes[i]);
k++;
i++;
}
if (k > 0)
processFind(rval,rvalMap,jobID,linkType,list);
return rval;
}
/** Find max clause count.
*/
protected int maxClauseProcessFind(Long jobID, String linkType)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(linkTypeField,linkType)});
}
/** Process a portion of a find request for hopcount information.
*/
protected void processFind(int[] rval, Map rvalMap, Long jobID, String linkType, ArrayList list)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
String query = buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(parentIDHashField,list),
new UnitaryClause(linkTypeField,linkType)});
IResultSet set = performQuery("SELECT "+distanceField+","+parentIDHashField+" FROM "+getTableName()+" WHERE "+query,newList,null,null);
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i++);
String parentIDHash = (String)row.getValue(parentIDHashField);
Long distance = (Long)row.getValue(distanceField);
rval[((Integer)rvalMap.get(parentIDHash)).intValue()] = (int)distance.longValue();
}
}
/** Process a stage of the propagation queue for a job.
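* <br><br>Hypothetical driver loop (illustrative only):
* <pre>
* while (!hopCount.processQueue(jobID, legalLinkTypes, hopcountMethod))
* {
*   // each pass resolves up to 200 queued rows; repeat until the queue drains
* }
* </pre>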
*@param jobID is the job we need to have the hopcount propagated for.
*@return true if the queue is empty.
*/
public boolean processQueue(Long jobID, String[] legalLinkTypes, int hopcountMethod)
throws ManifoldCFException
{
// We can't instantiate the DocumentHash object here, because it will wind up having
// cached in it the answers from the previous round of calculation. That round had
// a different set of marked nodes than the current round.
ArrayList list = new ArrayList();
// Pick off up to n queue items at a time. We don't want to pick off too many (because
// then we wind up delaying other threads too much), nor do we want to do one at a time
// (because that is inefficient against the database), so I picked 200 as being roughly 200x faster
// than 1...
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(markForDeathField,markToString(MARK_QUEUED))});
IResultSet set = performQuery("SELECT "+linkTypeField+","+parentIDHashField+" FROM "+
getTableName()+" WHERE "+query+" "+constructOffsetLimitClause(0,200)+" FOR UPDATE",list,null,null,200);
// No more entries == we are done
if (set.getRowCount() == 0)
return true;
DocumentHash dh = new DocumentHash(jobID,legalLinkTypes,hopcountMethod);
Question[] questions = new Question[set.getRowCount()];
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i);
String parentIdentifierHash = (String)row.getValue(parentIDHashField);
String linkType = (String)row.getValue(linkTypeField);
// All documents in the set have the same basic assumptions; another set may be queued
// as a side effect of some of these getting resolved, but treating them in chunks
// seems like it should not cause problems (because the same underlying assumptions
// underlie the whole chunk). The side effects *may* cause other documents that are
// still in the queue to be evaluated as well, in which case they will disappear from
// the queue and not be processed further.
// Create a document hash object.
questions[i] = new Question(parentIdentifierHash,linkType);
i++;
}
// We don't care what the response is; we just want the documents to leave the queue.
dh.askQuestions(questions);
return false;
}
/** Calculate max clauses */
protected int maxClausePerformFindMissingRecords(Long jobID, String[] affectedLinkTypes)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(linkTypeField,affectedLinkTypes)});
}
/** Limited find for missing records.
*/
protected void performFindMissingRecords(Long jobID, String[] affectedLinkTypes, ArrayList list, Map<Question,Long> matchMap)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
String query = buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(parentIDHashField,list),
new MultiClause(linkTypeField,affectedLinkTypes)});
// The naive query is this - but postgres does not find the index this way:
//IResultSet set = performQuery("SELECT "+parentIDField+","+linkTypeField+" FROM "+getTableName()+" WHERE "+
// parentIDField+" IN("+query+") AND "+jobIDField+"=?",list,null,null);
IResultSet set = performQuery("SELECT "+parentIDHashField+","+linkTypeField+","+distanceField+" FROM "+getTableName()+" WHERE "+query,newList,null,null);
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i++);
String docIDHash = (String)row.getValue(parentIDHashField);
String linkType = (String)row.getValue(linkTypeField);
Long distance = (Long)row.getValue(distanceField);
Question q = new Question(docIDHash,linkType);
matchMap.put(q,distance);
}
}
/** Add documents to the processing queue. For the supplied bunch of link types and document ids,
* the corresponding hopcount records will be marked as being queued. If, for example, the affected link types
* are 'link' and 'redirect', and the specified document ids are 'A' and 'B' and 'C', then six hopcount
* rows will be created and/or queued.
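* For illustration, those six (link type, document) combinations would be:
* <pre>
*   ('link','A')      ('link','B')      ('link','C')
*   ('redirect','A')  ('redirect','B')  ('redirect','C')
* </pre>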
* The values that this code uses for initial distance or delete dependencies for each of the hopcount
* rows combinatorially described above are calculated by this method by starting with the passed-in hopcount values
* and dependencies for each of the affectedLinkTypes for the specified "source" document. The result estimates are then
* generated by passing these values and dependencies over the links to the target document identifiers, presuming that
* the link is of the supplied link type.
*
*@param jobID is the job the documents belong to.
*@param affectedLinkTypes are the set of affected link types.
*@param documentIDHashes are the documents to add.
*@param startingAnswers are the hopcounts for the documents as they are currently known.
*@param sourceDocumentIDHash is the source document identifier for the links from source to target documents.
*@param linkType is the link type for this queue addition.
*@param hopcountMethod is the desired method of managing hopcounts.
*@return a boolean array which is the subset of documentIDHashes whose distances may have changed.
*/
protected boolean[] addToProcessingQueue(Long jobID, String[] affectedLinkTypes, String[] documentIDHashes,
Answer[] startingAnswers, String sourceDocumentIDHash, String linkType, int hopcountMethod)
throws ManifoldCFException
{
// If we're given the source hopcount distances, we should write the derived target values into the NEW
// hopcount records we create, because it will save much database access in the long run, and handles the
// typical case in an inexpensive way. These records do not even need to be queued - since we are creating
// them, we know there are no other paths to them yet (or paths that depend upon them). So we can write in
// 'final' values, which will need to be updated only if the source hopcount row's distance is lowered (and
// then, the targets will all be requeued anyhow).
//
// For EXISTING hopcount rows, I've opted to not consider the passed-in distance estimates. Even if I should
// detect that the hopcount has improved, there would still be the requirement of requeuing all the target's
// targets. This kind of propagation is probably best handled by the normal queue processing code, which does
// as much in bulk as is possible. So, for existing target hopcount rows, they simply get queued.
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Adding "+Integer.toString(documentIDHashes.length)+" documents to processing queue");
for (int z = 0; z < documentIDHashes.length; z++)
{
Logging.hopcount.debug(" Adding '"+documentIDHashes[z]+"' to processing queue");
}
Logging.hopcount.debug("The source id is '"+sourceDocumentIDHash+"' and linktype is '"+linkType+"', and there are "+
Integer.toString(affectedLinkTypes.length)+" affected link types, as below:");
for (int z = 0; z < affectedLinkTypes.length; z++)
{
Logging.hopcount.debug(" Linktype '"+affectedLinkTypes[z]+"', current distance "+Integer.toString(startingAnswers[z].getAnswer())+" with "+
Integer.toString(startingAnswers[z].countDeleteDependencies())+" delete dependencies.");
}
}
// If hopcount records for the targets for the links don't yet exist, we had better create them,
// so we can make sure they are added to the queue properly.
// Make a map of the combinations of link type and document id we want to have present
Map<Question,Long> matchMap = new HashMap<Question,Long>();
// Make a map from the link type to the corresponding Answer object
Map<String,Answer> answerMap = new HashMap<String,Answer>();
for (int u = 0; u < affectedLinkTypes.length; u++)
{
answerMap.put(affectedLinkTypes[u],startingAnswers[u]);
}
boolean[] rval = new boolean[documentIDHashes.length];
for (int i = 0; i < rval.length; i++)
{
rval[i] = false;
}
// I don't think we have to throw a table lock here, because even though we base decisions for insertion on the lack of existence
// of a record, there can be only one thread in here at a time.
int maxClause = maxClausePerformFindMissingRecords(jobID,affectedLinkTypes);
ArrayList list = new ArrayList();
int k = 0;
for (int i = 0; i < documentIDHashes.length; i++)
{
String documentIDHash = documentIDHashes[i];
if (k == maxClause)
{
performFindMissingRecords(jobID,affectedLinkTypes,list,matchMap);
k = 0;
list.clear();
}
list.add(documentIDHash);
k++;
}
if (k > 0)
performFindMissingRecords(jobID,affectedLinkTypes,list,matchMap);
// Repeat our pass through the documents and legal link types. For each document/legal link type,
// see if there was an existing row. If not, we create a row. If so, we compare the recorded
// distance against the distance estimate we would have given it. If the new distance is LOWER, it gets left around
// for queuing.
HashMap map = new HashMap();
for (int i = 0; i < documentIDHashes.length; i++)
{
String documentIDHash = documentIDHashes[i];
for (int j = 0; j < affectedLinkTypes.length; j++)
{
String affectedLinkType = affectedLinkTypes[j];
Question q = new Question(documentIDHash,affectedLinkType);
// Calculate what our new answer would be.
Answer startingAnswer = (Answer)answerMap.get(affectedLinkType);
int newAnswerValue = startingAnswer.getAnswer();
if (newAnswerValue >= 0 && affectedLinkType.equals(linkType))
newAnswerValue++;
// Now, see if there's a distance already present.
Long currentDistance = (Long)matchMap.get(q);
if (currentDistance == null)
{
// Prepare to do an insert.
// The dependencies are the old dependencies, plus the one we are about to add.
DeleteDependency dd = new DeleteDependency(linkType,documentIDHash,sourceDocumentIDHash);
// Build a new answer, based on the starting answer and the kind of link this is.
map.clear();
Long hopCountID = new Long(IDFactory.make(threadContext));
map.put(idField,hopCountID);
map.put(parentIDHashField,q.getDocumentIdentifierHash());
map.put(linkTypeField,q.getLinkType());
if (newAnswerValue == ANSWER_INFINITY)
map.put(distanceField,new Long(-1L));
else
map.put(distanceField,new Long((long)newAnswerValue));
map.put(jobIDField,jobID);
map.put(markForDeathField,markToString(MARK_NORMAL));
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Inserting new record for '"+documentIDHash+"' linktype '"+affectedLinkType+"' distance "+Integer.toString(newAnswerValue)+" for job "+jobID);
performInsert(map,null);
noteModifications(1,0,0);
if (hopcountMethod != IJobDescription.HOPCOUNT_NEVERDELETE)
{
deleteDepsManager.writeDependency(hopCountID,jobID,dd);
Iterator iter2 = startingAnswer.getDeleteDependencies();
while (iter2.hasNext())
{
dd = (DeleteDependency)iter2.next();
deleteDepsManager.writeDependency(hopCountID,jobID,dd);
}
}
}
else
{
// If the new distance >= saved distance, don't queue anything. That means, remove it from the hash.
int oldAnswerValue = (int)currentDistance.longValue();
if (!(newAnswerValue >= 0 && (oldAnswerValue < 0 || newAnswerValue < oldAnswerValue)))
{
// New answer is no better than the old answer, so don't queue
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Existing record for '"+documentIDHash+"' linktype '"+affectedLinkType+"' has better distance "+Integer.toString(oldAnswerValue)+
" than new distance "+Integer.toString(newAnswerValue)+", so not queuing for job "+jobID);
matchMap.remove(q);
}
else
rval[i] = true;
}
}
}
// For all the records still in the matchmap, queue them.
// The query I want to run is:
// UPDATE hopcount SET markfordeath='Q' WHERE jobID=? AND parentid IN (...)
// but postgresql is stupid and won't use the index that way. So do this instead:
// UPDATE hopcount SET markfordeath='Q' WHERE (jobID=? AND parentid=?) OR (jobid=? AND parentid=?)...
maxClause = getMaxOrClause();
StringBuilder sb = new StringBuilder();
list = new ArrayList();
k = 0;
for (int i = 0; i < documentIDHashes.length; i++)
{
String documentIDHash = documentIDHashes[i];
for (int j = 0; j < affectedLinkTypes.length; j++)
{
String affectedLinkType = affectedLinkTypes[j];
Question q = new Question(documentIDHash,affectedLinkType);
if (matchMap.get(q) != null)
{
if (k == maxClause)
{
performMarkAddDeps(sb.toString(),list);
k = 0;
sb.setLength(0);
list.clear();
}
if (k > 0)
sb.append(" OR ");
// We only want to queue up hopcount records that correspond to the affected link types.
//
// Also, to reduce deadlock, do not update any records that are already marked as queued. These would be infrequent,
// but they nevertheless seem to cause deadlock very easily.
//
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Queuing '"+documentIDHash+"' linktype '"+affectedLinkType+"' for job "+jobID);
sb.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new MultiClause(markForDeathField,new Object[]{
markToString(MARK_NORMAL),
markToString(MARK_DELETING)}),
new UnitaryClause(parentIDHashField,documentIDHash),
new UnitaryClause(linkTypeField,affectedLinkType)}));
k++;
}
}
}
if (k > 0)
performMarkAddDeps(sb.toString(),list);
// Leave the dependency records for the queued rows. This will save lots of work if we decide not to
// update the distance. It's safe to leave the old dep records, because they must only record links that furnish
// A minimal path, not THE minimal path.
noteModifications(0,documentIDHashes.length,0);
return rval;
}
/** Do the work of marking add-dep-dependent links in the hopcount table. */
protected void performMarkAddDeps(String query, ArrayList list)
throws ManifoldCFException
{
HashMap map = new HashMap();
map.put(markForDeathField,markToString(MARK_QUEUED));
performUpdate(map,"WHERE "+query,list,null);
}
/** Method that does the work of "finishing" a set of child references. */
protected void doFinish(Long jobID, String[] legalLinkTypes, String[] sourceDocumentHashes, int hopcountMethod)
throws ManifoldCFException
{
if (hopcountMethod == IJobDescription.HOPCOUNT_ACCURATE)
{
// First, blow the cache.
//
// To do this, I'd like the following queries to occur:
//
// UPDATE hopcount SET markfordeath='Q' WHERE EXISTS(SELECT 'x' FROM hopdeletedeps t0 WHERE t0.ownerid=hopcount.id AND t0.jobid=<jobid>
// AND EXISTS(SELECT 'x' FROM intrinsiclinks t1 WHERE t1.linktype=t0.linktype AND t1.parentid=t0.parentid
// AND t1.childid=t0.childid AND t1.jobid=<jobid> AND t1.isnew=<base> AND t1.childid IN(<sourcedocs>)))
//
// ... and then, get rid of all hopcount records and their dependencies that are marked for delete.
// Invalidate all links with the given source documents that match the common expression
doDeleteInvalidation(jobID,sourceDocumentHashes);
}
// Make all new and existing links become just "base" again.
intrinsicLinkManager.restoreLinks(jobID,sourceDocumentHashes);
}
/** Invalidate links that start with a specific set of documents, described by
* a table join.
*/
protected void doDeleteDocuments(Long jobID,
String joinTableName,
String joinTableIDColumn, String joinTableJobColumn,
String joinTableCriteria, ArrayList joinTableParams)
throws ManifoldCFException
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Marking for delete for job "+jobID+" all hopcount document references"+
" from table "+joinTableName+" matching "+joinTableCriteria);
}
// For this query, postgresql seems to not do the right thing unless the subclause is a three-way join:
//
// UPDATE hopcount SET x=y WHERE id IN(SELECT t0.ownerid FROM hopdeletedeps t0,jobqueue t99,intrinsiclink t1 WHERE
// t0.jobid=? and t99.jobid=? and t1.jobid=? and
// t0.childidhash=t99.dochash and t0.childid=t99.docid and t99.status='P' and
// t0.parentidhash=t1.parentidhash and t0.childidhash=t1.childidhash and t0.linktype=t1.linktype and
// t0.parentid=t1.parentid and t0.childid=t1.childid)
// MHL to figure out the "correct" way to state this for all databases
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList list = new ArrayList();
sb.append(idField).append(" IN(SELECT t0.").append(deleteDepsManager.ownerIDField).append(" FROM ")
.append(deleteDepsManager.getTableName()).append(" t0,").append(joinTableName).append(",")
.append(intrinsicLinkManager.getTableName()).append(" t1 WHERE ");
sb.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause("t0."+deleteDepsManager.jobIDField,jobID)})).append(" AND ");
sb.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause("t1."+intrinsicLinkManager.jobIDField,jobID),
new JoinClause("t1."+intrinsicLinkManager.parentIDHashField,"t0."+deleteDepsManager.parentIDHashField),
new JoinClause("t1."+intrinsicLinkManager.linkTypeField,"t0."+deleteDepsManager.linkTypeField),
new JoinClause("t1."+intrinsicLinkManager.childIDHashField,"t0."+deleteDepsManager.childIDHashField)})).append(" AND ");
sb.append(buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(joinTableJobColumn,jobID),
new JoinClause(joinTableIDColumn,"t0."+deleteDepsManager.childIDHashField)})).append(" AND ");
sb.append(joinTableCriteria);
list.addAll(joinTableParams);
sb.append(")");
HashMap map = new HashMap();
// These are whacked back to "infinity" to avoid infinite looping in a cut-off graph.
map.put(distanceField,new Long(-1L));
map.put(markForDeathField,markToString(MARK_DELETING));
performUpdate(map,sb.toString(),list,null);
noteModifications(0,1,0);
// We do NOT do the parentID because otherwise we have the potential to delete links that we need later. See CONNECTORS-501.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done setting hopcount rows for job "+jobID+" to initial distances");
// Remove the intrinsic links that we said we would - BEFORE we evaluate the queue.
intrinsicLinkManager.removeDocumentLinks(jobID,
joinTableName,
joinTableIDColumn,joinTableJobColumn,
joinTableCriteria,joinTableParams);
// Remove the delete dependencies of the nodes marked as being queued, with distance infinity.
ArrayList queryList = new ArrayList();
String query = buildConjunctionClause(queryList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(markForDeathField,markToString(MARK_DELETING))});
deleteDepsManager.removeMarkedRows(getTableName(),idField,query,queryList);
// Set the hopcount rows back to just "queued".
HashMap newMap = new HashMap();
newMap.put(markForDeathField,markToString(MARK_QUEUED));
performUpdate(newMap,"WHERE "+query,queryList,null);
// At this point, we have a queue that contains all the hopcount entries that our dependencies told us
// needed to change as a result of the deletions. Evaluating the queue will clean up hopcount entries
// and dependencies that are just going away, as well as updating those that are still around but
// will have new hopcount values.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done queueing for deletion for "+jobID);
}
/** Invalidate links that start with a specific set of documents.
*/
protected void doDeleteDocuments(Long jobID,
String[] documentHashes)
throws ManifoldCFException
{
// Clear up hopcount table
if (documentHashes.length > 0)
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Marking for delete for job "+jobID+" all hopcount document references"+
" from:");
for (int k = 0; k < documentHashes.length; k++)
{
Logging.hopcount.debug(" "+documentHashes[k]);
}
}
// The query form I found that seems to work ok with postgresql looks like this:
//
// UPDATE hopcount SET x=y WHERE id IN (SELECT ownerid FROM hopdeletedeps t0
// WHERE ((t0.jobid=? AND t0.childid=?)
// OR (t0.jobid=? AND t0.childid=?)
// ...
// OR (t0.jobid=? AND t0.childid=?))
// AND EXISTS(SELECT 'x' FROM intrinsiclink t1 WHERE t1.linktype=t0.linktype
// AND t1.parentid=t0.parentid AND t1.childid=t0.childid AND t1.jobid=t0.jobid AND t1.isnew='B'))
//
// Here's a revised form that would take advantage of postgres's better ability to work with joins, if this should
// turn out to be necessary:
//
// UPDATE hopcount SET x=y WHERE id IN (SELECT t0.ownerid FROM hopdeletedeps t0, intrinsiclink t1
// WHERE t1.childidhash=t0.childidhash AND t1.jobid=? AND t1.linktype=t0.linktype AND t1.parentid=t0.parentid AND t1.childid=t0.childid AND t1.isnew='B'
// AND ((t0.jobid=? AND t0.childidhash=? AND t0.childid=?)
// OR (t0.jobid=? AND t0.childidhash=? AND t0.childid=?)
// ...
// OR (t0.jobid=? AND t0.childidhash=? AND t0.childid=?))
int maxClause = maxClauseMarkForDocumentDelete(jobID);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
while (i < documentHashes.length)
{
if (k == maxClause)
{
markForDocumentDelete(jobID,list);
list.clear();
k = 0;
}
list.add(documentHashes[i]);
i++;
k++;
}
if (k > 0)
markForDocumentDelete(jobID,list);
noteModifications(0,documentHashes.length,0);
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done setting hopcount rows for job "+jobID+" to initial distances");
// Remove the intrinsic links that we said we would - BEFORE we evaluate the queue.
intrinsicLinkManager.removeDocumentLinks(jobID,
documentHashes);
// Remove the delete dependencies of the nodes marked as being queued, with distance infinity.
ArrayList queryList = new ArrayList();
String query = buildConjunctionClause(queryList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(markForDeathField,markToString(MARK_DELETING))});
deleteDepsManager.removeMarkedRows(getTableName(),idField,query,queryList);
// Set the hopcount rows back to just "queued".
HashMap newMap = new HashMap();
newMap.put(markForDeathField,markToString(MARK_QUEUED));
performUpdate(newMap,"WHERE "+query,queryList,null);
// At this point, we have a queue that contains all the hopcount entries that our dependencies told us
// needed to change as a result of the deletions. Evaluating the queue will clean up hopcount entries
// and dependencies that are just going away, as well as updating those that are still around but
// will have new hopcount values.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done queueing for deletion for "+jobID);
}
}
protected int maxClauseMarkForDocumentDelete(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause("t0."+deleteDepsManager.jobIDField,jobID)});
}
protected void markForDocumentDelete(Long jobID, ArrayList list)
throws ManifoldCFException
{
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList thisList = new ArrayList();
sb.append(idField).append(" IN(SELECT ").append(deleteDepsManager.ownerIDField).append(" FROM ").append(deleteDepsManager.getTableName()).append(" t0 WHERE ")
.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new UnitaryClause("t0."+deleteDepsManager.jobIDField,jobID),
new MultiClause("t0."+deleteDepsManager.childIDHashField,list)})).append(" AND ");
sb.append("EXISTS(SELECT 'x' FROM ").append(intrinsicLinkManager.getTableName()).append(" t1 WHERE ")
.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new JoinClause("t1."+intrinsicLinkManager.jobIDField,"t0."+deleteDepsManager.jobIDField),
new JoinClause("t1."+intrinsicLinkManager.linkTypeField,"t0."+deleteDepsManager.linkTypeField),
new JoinClause("t1."+intrinsicLinkManager.parentIDHashField,"t0."+deleteDepsManager.parentIDHashField),
new JoinClause("t1."+intrinsicLinkManager.childIDHashField,"t0."+deleteDepsManager.childIDHashField)}));
sb.append("))");
HashMap map = new HashMap();
// These are whacked back to "infinity" to avoid infinite looping in a cut-off graph.
map.put(distanceField,new Long(-1L));
map.put(markForDeathField,markToString(MARK_DELETING));
performUpdate(map,sb.toString(),thisList,null);
// We do NOT do the parentID because we need to leave intrinsic links around that could be used again.
// See CONNECTORS-501.
}
/** Invalidate links meeting a simple criteria which have a given set of source documents. This also runs a queue
* which is initialized with all the documents that have sources that exist in the hopcount table. The purpose
* of that queue is to re-establish non-infinite values for all nodes described in IntrinsicLinks
* that are still connected to the root. */
protected void doDeleteInvalidation(Long jobID,
String[] sourceDocumentHashes)
throws ManifoldCFException
{
ArrayList commonNewList = new ArrayList();
commonNewList.add(intrinsicLinkManager.statusToString(intrinsicLinkManager.LINKSTATUS_BASE));
String commonNewExpression = intrinsicLinkManager.newField+"=?";
// Clear up hopcount table
if (sourceDocumentHashes.length > 0)
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Marking for delete for job "+jobID+" all target document references matching '"+commonNewExpression+"'"+
" from:");
for (int k = 0; k < sourceDocumentHashes.length; k++)
{
Logging.hopcount.debug(" "+sourceDocumentHashes[k]);
}
}
// The query form I found that seems to work ok with postgresql looks like this:
//
// UPDATE hopcount SET x=y WHERE id IN (SELECT ownerid FROM hopdeletedeps t0
// WHERE ((t0.jobid=? AND t0.childid=?)
// OR (t0.jobid=? AND t0.childid=?)
// ...
// OR (t0.jobid=? AND t0.childid=?))
// AND EXISTS(SELECT 'x' FROM intrinsiclink t1 WHERE t1.linktype=t0.linktype
// AND t1.parentid=t0.parentid AND t1.childid=t0.childid AND t1.jobid=t0.jobid AND t1.isnew='B'))
//
// Here's a revised form that would take advantage of postgres's better ability to work with joins, if this should
// turn out to be necessary:
//
// UPDATE hopcount SET x=y WHERE id IN (SELECT t0.ownerid FROM hopdeletedeps t0, intrinsiclink t1
// WHERE t1.childidhash=t0.childidhash AND t1.jobid=? AND t1.linktype=t0.linktype AND t1.parentid=t0.parentid AND t1.childid=t0.childid AND t1.isnew='B'
// AND ((t0.jobid=? AND t0.childidhash=? AND t0.childid=?)
// OR (t0.jobid=? AND t0.childidhash=? AND t0.childid=?)
// ...
// OR (t0.jobid=? AND t0.childidhash=? AND t0.childid=?))
int maxClause = maxClauseMarkForDelete(jobID);
ArrayList list = new ArrayList();
int i = 0;
int k = 0;
while (i < sourceDocumentHashes.length)
{
if (k == maxClause)
{
markForDelete(jobID,list,commonNewExpression,commonNewList);
list.clear();
k = 0;
}
list.add(sourceDocumentHashes[i]);
i++;
k++;
}
if (k > 0)
markForDelete(jobID,list,commonNewExpression,commonNewList);
noteModifications(0,sourceDocumentHashes.length,0);
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done setting hopcount rows for job "+jobID+" to initial distances");
// Remove the intrinsic links that we said we would - BEFORE we evaluate the queue.
intrinsicLinkManager.removeLinks(jobID,
commonNewExpression,commonNewList,
sourceDocumentHashes);
// Remove the delete dependencies of the nodes marked as being queued, with distance infinity.
ArrayList queryList = new ArrayList();
String query = buildConjunctionClause(queryList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(markForDeathField,markToString(MARK_DELETING))});
deleteDepsManager.removeMarkedRows(getTableName(),idField,query,queryList);
// Set the hopcount rows back to just "queued".
HashMap newMap = new HashMap();
newMap.put(markForDeathField,markToString(MARK_QUEUED));
performUpdate(newMap,"WHERE "+query,queryList,null);
// At this point, we have a queue that contains all the hopcount entries that our dependencies told us
// needed to change as a result of the deletions. Evaluating the queue will clean up hopcount entries
// and dependencies that are just going away, as well as updating those that are still around but
// will have new hopcount values.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Done queueing for deletion for "+jobID);
}
}
protected int maxClauseMarkForDelete(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause("t0."+deleteDepsManager.jobIDField,jobID)});
}
protected void markForDelete(Long jobID, ArrayList list, String commonNewExpression, ArrayList commonNewList)
throws ManifoldCFException
{
StringBuilder sb = new StringBuilder("WHERE ");
ArrayList thisList = new ArrayList();
sb.append(idField).append(" IN(SELECT ").append(deleteDepsManager.ownerIDField).append(" FROM ").append(deleteDepsManager.getTableName()).append(" t0 WHERE ")
.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new UnitaryClause("t0."+deleteDepsManager.jobIDField,jobID),
new MultiClause("t0."+deleteDepsManager.childIDHashField,list)})).append(" AND ");
sb.append("EXISTS(SELECT 'x' FROM ").append(intrinsicLinkManager.getTableName()).append(" t1 WHERE ")
.append(buildConjunctionClause(thisList,new ClauseDescription[]{
new JoinClause("t1."+intrinsicLinkManager.jobIDField,"t0."+deleteDepsManager.jobIDField),
new JoinClause("t1."+intrinsicLinkManager.linkTypeField,"t0."+deleteDepsManager.linkTypeField),
new JoinClause("t1."+intrinsicLinkManager.parentIDHashField,"t0."+deleteDepsManager.parentIDHashField),
new JoinClause("t1."+intrinsicLinkManager.childIDHashField,"t0."+deleteDepsManager.childIDHashField)}));
if (commonNewExpression != null)
{
sb.append(" AND t1.").append(commonNewExpression);
thisList.addAll(commonNewList);
}
sb.append("))");
HashMap map = new HashMap();
// These are whacked back to "infinity" to avoid infinite looping in a cut-off graph.
map.put(distanceField,new Long(-1L));
map.put(markForDeathField,markToString(MARK_DELETING));
performUpdate(map,sb.toString(),thisList,null);
}
/** Get document's children.
*@return rows that contain the children. Column names are 'linktype','childidentifier'.
*/
protected IResultSet getDocumentChildren(Long jobID, String documentIDHash)
throws ManifoldCFException
{
return intrinsicLinkManager.getDocumentChildren(jobID,documentIDHash);
}
/** Find the cached distance from a set of identifiers to the root.
* This is tricky, because if there is a queue assessment going on, some values are not valid.
* In general, one would treat a missing record as meaning "infinity". But if a record exists and
* is merely invalidated at the moment, we want its value to be treated as "unknown". So... we pick up
* the record despite it potentially being marked, and we then examine the mark to figure out
* what to do.
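* <br><br>Summary of how a record's state is interpreted (per the logic below):
* <ul>
* <li>row present, mark 'N' (normal): use the cached distance (-1 in the row means infinity)</li>
* <li>row present, mark 'Q' (queued): the value is being re-evaluated; treat it as "unknown"</li>
* <li>row present, mark 'D' (deleting): illegal here; signals an algorithm transaction error</li>
* <li>no row: treat the distance as "infinity"</li>
* </ul>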
*@return the corresponding list of nodes, taking into account unknown distances.
*/
protected DocumentNode[] readCachedNodes(Long jobID, Question[] unansweredQuestions)
throws ManifoldCFException
{
// We should not ever get requests that are duplications, or are not germane (e.g.
// for the root).
DocumentNode[] rval = new DocumentNode[unansweredQuestions.length];
// Set the node up as being "infinity" first; we'll change it around later
Answer a = new Answer(ANSWER_INFINITY);
Map indexMap = new HashMap();
int i = 0;
while (i < unansweredQuestions.length)
{
indexMap.put(unansweredQuestions[i],new Integer(i));
// If we wind up deleting a row in the hopcount table, because its distance is infinity,
// we need to treat that here as loading a node with ANSWER_INFINITY as the value. Right
// now, we load UNKNOWN in this case, which is wrong.
//
// The way in which this deletion occurs is that nodes get marked BEFORE the intrinsic link goes
// away (supposedly), and then the intrinsic link(s) are removed. Plus, all possible nodes are not
// added in this case. Therefore, we should expect questions pertaining to nodes that don't exist
// to work.
DocumentNode dn = new DocumentNode(unansweredQuestions[i]);
rval[i] = dn;
// Make the node "complete", since we found a legit value.
dn.setStartingAnswer(a);
dn.setTrialAnswer(a);
// Leave bestPossibleAnswer alone. It's not used after node is marked complete.
dn.makeCompleteNoWrite();
i++;
}
// Accumulate the ids of rows where I need deps too. This is keyed by id and has the corresponding DocumentNode as a value.
Map depsMap = new HashMap();
int maxClause = maxClausePerformGetCachedDistances(jobID);
ArrayList list = new ArrayList();
ArrayList ltList = new ArrayList();
i = 0;
int k = 0;
while (i < unansweredQuestions.length)
{
if (k == maxClause)
{
performGetCachedDistances(rval,indexMap,depsMap,jobID,ltList,list);
k = 0;
list.clear();
ltList.clear();
}
Question q = unansweredQuestions[i++];
ltList.add(q.getLinkType());
list.add(q.getDocumentIdentifierHash());
k++;
}
if (k > 0)
performGetCachedDistances(rval,indexMap,depsMap,jobID,ltList,list);
// Now, find the required delete dependencies too.
maxClause = maxClausePerformGetCachedDistanceDeps();
list.clear();
k = 0;
Iterator iter = depsMap.keySet().iterator();
while (iter.hasNext())
{
Long id = (Long)iter.next();
if (k == maxClause)
{
performGetCachedDistanceDeps(depsMap,list);
k = 0;
list.clear();
}
list.add(id);
k++;
}
if (k > 0)
performGetCachedDistanceDeps(depsMap,list);
return rval;
}
protected int maxClausePerformGetCachedDistanceDeps()
{
return findConjunctionClauseMax(new ClauseDescription[]{});
}
/** Do a limited fetch of cached distance dependencies */
protected void performGetCachedDistanceDeps(Map depsMap, ArrayList list)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
String query = buildConjunctionClause(newList,new ClauseDescription[]{
new MultiClause(deleteDepsManager.ownerIDField,list)});
IResultSet set = performQuery("SELECT "+deleteDepsManager.ownerIDField+","+
deleteDepsManager.linkTypeField+","+
deleteDepsManager.parentIDHashField+","+
deleteDepsManager.childIDHashField+" FROM "+deleteDepsManager.getTableName()+
" WHERE "+query,newList,null,null);
// Each dependency needs to be filed by owner id, so let's populate a hash. The
// hash will be keyed by owner id and contain an arraylist of deletedependency
// objects.
HashMap ownerHash = new HashMap();
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i++);
Long ownerID = (Long)row.getValue(deleteDepsManager.ownerIDField);
String linkType = (String)row.getValue(deleteDepsManager.linkTypeField);
if (linkType == null)
linkType = "";
String parentIDHash = (String)row.getValue(deleteDepsManager.parentIDHashField);
String childIDHash = (String)row.getValue(deleteDepsManager.childIDHashField);
if (childIDHash == null)
childIDHash = "";
DeleteDependency dd = new DeleteDependency(linkType,parentIDHash,childIDHash);
ArrayList ddlist = (ArrayList)ownerHash.get(ownerID);
if (ddlist == null)
{
ddlist = new ArrayList();
ownerHash.put(ownerID,ddlist);
}
ddlist.add(dd);
}
// Now, for each owner, populate the dependencies in the answer
Iterator iter = ownerHash.keySet().iterator();
while (iter.hasNext())
{
Long owner = (Long)iter.next();
ArrayList ddlist = (ArrayList)ownerHash.get(owner);
if (ddlist != null)
{
DocumentNode dn = (DocumentNode)depsMap.get(owner);
DeleteDependency[] array = new DeleteDependency[ddlist.size()];
int j = 0;
while (j < array.length)
{
array[j] = (DeleteDependency)ddlist.get(j);
j++;
}
// In the DocumentNodes created earlier, update both the starting answer and the trial answer
// to include the dependencies we just read.
Answer a = dn.getStartingAnswer();
dn.setStartingAnswer(new Answer(a.getAnswer(),array));
a = dn.getTrialAnswer();
dn.setTrialAnswer(new Answer(a.getAnswer(),array));
}
}
}
/** Calculate the max clauses.
*/
protected int maxClausePerformGetCachedDistances(Long jobID)
{
// Always OR clauses, so it's maxORClause.
return getMaxOrClause();
}
/** Do a limited fetch of cached distances */
protected void performGetCachedDistances(DocumentNode[] rval, Map indexMap, Map depsMap, Long jobID, ArrayList ltList, ArrayList list)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
StringBuilder sb = new StringBuilder();
for (int i = 0 ; i < list.size() ; i++)
{
if (i > 0)
sb.append(" OR ");
sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(jobIDField,jobID),
new UnitaryClause(parentIDHashField,list.get(i)),
new UnitaryClause(linkTypeField,ltList.get(i))}));
}
String query = sb.toString();
IResultSet set = performQuery("SELECT "+idField+","+parentIDHashField+","+linkTypeField+","+distanceField+","+markForDeathField+
" FROM "+getTableName()+" WHERE "+query,newList,null,null);
// Go through results and create answers
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i++);
String parentIDHash = (String)row.getValue(parentIDHashField);
String linkType = (String)row.getValue(linkTypeField);
Question q = new Question(parentIDHash,linkType);
Long id = (Long)row.getValue(idField);
Long distance = (Long)row.getValue(distanceField);
int answerDistance;
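// Note: in the read path, a stored distance of -1 is decoded as ANSWER_INFINITY.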
if (distance.longValue() == -1L)
answerDistance = ANSWER_INFINITY;
else
answerDistance = (int)distance.longValue();
DocumentNode dn = rval[((Integer)indexMap.get(q)).intValue()];
// If the record is marked, don't use its value; we'll look at it again on write.
// Get the mark.
int foundMark = stringToMark((String)row.getValue(markForDeathField));
if (foundMark != MARK_NORMAL)
{
if (foundMark == MARK_QUEUED)
{
// The record has been disabled because it's on the queue.
// We treat this as 'unknown value'.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("For '"+parentIDHash+"' linktype '"+linkType+"', the record is marked: returned 'unknown'");
// Reset the node to be "unknown" and "incomplete"
dn.reset();
// Leave the document node as-is (unknown), except set the source information.
dn.setSource(id,answerDistance);
continue;
}
else
{
Logging.hopcount.error("Document '"+parentIDHash+"' linktype '"+linkType+"' is labeled with 'DELETING'!");
throw new ManifoldCFException("Algorithm transaction error!");
}
}
// Initially the returned answer has no dependencies. We'll add the dependencies later.
if (answerDistance != ANSWER_INFINITY)
{
// Need the dependencies for anything better than infinity
depsMap.put(id,dn);
}
// Make the node "complete", since we found a legit value.
dn.setStartingAnswer(new Answer(answerDistance));
dn.setTrialAnswer(new Answer(answerDistance));
// Leave bestPossibleAnswer alone. It's not used after node is marked complete.
dn.makeCompleteNoWrite();
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("For '"+parentIDHash+"' linktype '"+linkType+"', the value returned is "+Integer.toString(dn.getFinalAnswer()));
}
}
/** Write a distance into the cache.
*/
protected void writeCachedDistance(Long jobID, String[] legalLinkTypes, DocumentNode dn, int hopcountMethod)
throws ManifoldCFException
{
Question q = dn.getQuestion();
String linkType = q.getLinkType();
String parentIDHash = q.getDocumentIdentifierHash();
Answer answer = dn.getTrialAnswer();
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Deciding whether to cache answer for document '"+parentIDHash+"' linktype '"+linkType+"' answer="+Integer.toString(answer.getAnswer()));
int answerValue = answer.getAnswer();
if (answerValue < 0 && answerValue != ANSWER_INFINITY)
return;
// Write cached distance and dependencies, all together.
// Yeah, this is expected to take place in a larger transaction, but I've bracketed necessary atomicity here
// also in case later we want to call this in another way.
HashMap map = new HashMap();
Iterator iter;
// Find the existing record
int existingDistance = dn.getDatabaseValue();
Long existingID = dn.getDatabaseRow();
if (existingID != null)
{
// If we find a cached distance here, it will be marked with the same value as is passed in.
// The algorithm makes us compare values in that case. If the new value is LESS than the current
// value, we must throw all the target documents of this node onto the queue.
// If the new answer is "infinity", delete the old record too.
if (answerValue == ANSWER_INFINITY)
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Caching infinity for document '"+parentIDHash+"' linktype '"+linkType+"' answer="+Integer.toString(answer.getAnswer()));
// Delete the old dependencies in any case.
deleteDepsManager.deleteOwnerRows(new Long[]{existingID});
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(idField,existingID)});
performDelete("WHERE "+query,list,null);
noteModifications(0,0,1);
// Since infinity is not a reduction of any kind, we're done here.
return;
}
// It should not be possible for an existing value to be better than the new value,
// because the way we get rid of links should clean up all questionable existing values.
if (existingDistance != ANSWER_INFINITY && existingDistance < answerValue)
{
Logging.hopcount.error("Existing distance "+Integer.toString(existingDistance)+" better than new distance "+
Integer.toString(answerValue)+" for '"+parentIDHash+"' linktype '"+linkType+"'");
throw new ManifoldCFException("Existing distance is better than new distance! Failure.");
}
// If the new distance is exactly the same as the old, we can leave everything as is.
// If the distance has improved, then push target documents onto the queue.
// Use the intrinsic link table for this.
if (existingDistance == ANSWER_INFINITY || existingDistance > answerValue)
{
// Update existing row, and write new delete dependencies.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Updating answer for document '"+parentIDHash+"' linktype '"+linkType+"' answer="+Integer.toString(answer.getAnswer()));
// We need to make sure the delete deps agree with what we have in mind.
// This is currently the most expensive part of propagating lots of changes, because most of the nodes
// have numerous delete dependencies. I therefore reorganized this code to be incremental where it makes
// sense to be. This could cut back on the number of required operations significantly.
HashMap existingDepsMap = new HashMap();
if (hopcountMethod != IJobDescription.HOPCOUNT_NEVERDELETE)
{
// If we knew in advance which nodes we'd be writing, we could have read the old
// delete deps when we read the old distance value, in one largish query covering some 25 nodes at a time.
// But we don't know in advance, so it's not clear whether we'd win or lose by such a strategy.
//
// In any case, I do believe that it will be rare for wholesale changes to occur to these dependencies,
// so I've chosen to optimize by reading the old dependencies and just writing out the deltas.
DeleteDependency[] existingDeps = deleteDepsManager.getDeleteDependencies(existingID);
/* This code demonstrated that once in a while Postgresql forgets to inherit the isolation level properly. I wound up disabling nested transactions inside
serializable transactions as a result, in DBInterfacePostgresql.
IResultSet set = performQuery("SHOW TRANSACTION ISOLATION LEVEL",null, null,null);
if (set.getRowCount() != 1)
throw new ManifoldCFException("Unexpected return: no rows");
IResultRow row = set.getRow(0);
if (row.getColumnCount() != 1)
throw new ManifoldCFException("Unexpected return: no columns");
Iterator itera = row.getColumns();
String columnName = (String)itera.next();
if (row.getValue(columnName).toString().indexOf("serializ") == -1)
throw new ManifoldCFException("Not in a serializable transaction! "+row.getValue(columnName).toString());
*/
// Drop these into a hash map.
int k = 0;
while (k < existingDeps.length)
{
DeleteDependency dep = existingDeps[k++];
existingDepsMap.put(dep,dep);
}
}
map.put(distanceField,new Long((long)answerValue));
map.put(markForDeathField,markToString(MARK_NORMAL));
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(idField,existingID)});
performUpdate(map,"WHERE "+query,list,null);
noteModifications(0,1,0);
if (hopcountMethod != IJobDescription.HOPCOUNT_NEVERDELETE)
{
// Write either dependencies, or dependency deltas
int incrementalOpCount = 0;
iter = existingDepsMap.keySet().iterator();
while (iter.hasNext())
{
DeleteDependency dep = (DeleteDependency)iter.next();
if (answer.hasDependency(dep) == false)
incrementalOpCount++;
}
iter = answer.getDeleteDependencies();
while (iter.hasNext())
{
DeleteDependency dep = (DeleteDependency)iter.next();
if (existingDepsMap.get(dep) == null)
incrementalOpCount++;
}
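// Worked example of the heuristic below (illustrative, not from the original comments):
// if the existing deps are {A,B,C} and the new answer's deps are {B,C,D}, the incremental
// cost is one delete (A) plus one insert (D) = 2 operations, versus a wholesale cost of one
// bulk delete plus 3 inserts = 4, so the incremental path wins. Only when the delta costs
// more than delete-everything-plus-reinsert do we clear and rewrite from scratch.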
if (incrementalOpCount > 1 + answer.countDeleteDependencies())
{
deleteDepsManager.deleteOwnerRows(new Long[]{existingID});
existingDepsMap.clear();
}
// Write the individual deletes...
iter = existingDepsMap.keySet().iterator();
while (iter.hasNext())
{
DeleteDependency dep = (DeleteDependency)iter.next();
if (answer.hasDependency(dep) == false)
deleteDepsManager.deleteDependency(existingID,dep);
}
// Then, inserts...
iter = answer.getDeleteDependencies();
while (iter.hasNext())
{
DeleteDependency dep = (DeleteDependency)iter.next();
if (existingDepsMap.get(dep) == null)
deleteDepsManager.writeDependency(existingID,jobID,dep);
}
}
String[] targetDocumentIDHashes = intrinsicLinkManager.getDocumentUniqueParents(jobID,parentIDHash);
// Push the target documents onto the queue!
// It makes sense to drop in a maximal (upper-bound) estimate of the hopcount when we do this queuing,
// because that estimate may well be low enough that the true hopcount value doesn't
// need to be calculated for a time. So, calculate an estimate and pass it in.
// The estimate will by definition be no smaller than the final value.
addToProcessingQueue(jobID,new String[]{linkType},targetDocumentIDHashes,new Answer[]{answer},parentIDHash,linkType,hopcountMethod);
}
else
{
// Take the row off the queue.
map.put(markForDeathField,markToString(MARK_NORMAL));
ArrayList list = new ArrayList();
String query = buildConjunctionClause(list,new ClauseDescription[]{
new UnitaryClause(idField,existingID)});
performUpdate(map,"WHERE "+query,list,null);
noteModifications(0,1,0);
}
// Done
return;
}
// The logic for dealing with "infinity" is that we need to remove such records from the table,
// in order to keep the table from growing forever.
if (answerValue == ANSWER_INFINITY)
{
// There is nothing currently recorded, so just exit.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Caching infinity for document '"+parentIDHash+"' linktype '"+linkType+"' answer="+Integer.toString(answer.getAnswer()));
return;
}
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Caching answer for document '"+parentIDHash+"' linktype '"+linkType+"' answer="+Integer.toString(answer.getAnswer()));
// We do NOT expect there to already be a cached entry! If there is, we've screwed
// up somehow, and it's a bug.
Long id = new Long(IDFactory.make(threadContext));
map.put(idField,id);
map.put(jobIDField,jobID);
if (linkType.length() > 0)
map.put(linkTypeField,linkType);
map.put(parentIDHashField,parentIDHash);
map.put(distanceField,new Long(answer.getAnswer()));
performInsert(map,null);
noteModifications(1,0,0);
if (hopcountMethod != IJobDescription.HOPCOUNT_NEVERDELETE)
{
iter = answer.getDeleteDependencies();
while (iter.hasNext())
{
DeleteDependency dep = (DeleteDependency)iter.next();
deleteDepsManager.writeDependency(id,jobID,dep);
}
}
}
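// To summarize the method above: when a row already exists, an answer of infinity deletes it,
// a strictly better answer updates it (writing dependency deltas and requeueing target
// documents), and an unchanged answer merely clears the row's queue mark. When no row exists,
// finite answers are inserted (along with their delete dependencies, if the hopcount method
// permits deletion), and infinite answers are not recorded at all.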
/** A class describing a document identifier and a link type, to be used in looking up the appropriate node in
* the hash.
*/
protected static class Question
{
/** Document identifier. */
protected String documentIdentifierHash;
/** Link type. */
protected String linkType;
/** Constructor. */
public Question(String documentIdentifierHash, String linkType)
{
this.documentIdentifierHash = documentIdentifierHash;
this.linkType = linkType;
}
/** Get the document identifier.
*/
public String getDocumentIdentifierHash()
{
return documentIdentifierHash;
}
/** Get the link type.
*/
public String getLinkType()
{
return linkType;
}
/** The 'question' is uniquely described by link type, document identifier,
* and maximum hop count. However, there is good reason to merge answers that share the
* same link type and document identifier wherever possible, so those are the fields we key on for hashing.
*/
public boolean equals(Object o)
{
if (!(o instanceof Question))
return false;
Question dn = (Question)o;
return dn.documentIdentifierHash.equals(documentIdentifierHash) && dn.linkType.equals(linkType);
}
/** Hash must agree with equals, above.
*/
public int hashCode()
{
return documentIdentifierHash.hashCode() + linkType.hashCode();
}
}
/** This class represents an answer - which consists both of an answer value, and also the dependencies
* of that answer (i.e. the add dependencies and delete dependencies).
*/
protected static class Answer
{
/** The answer value */
protected int answer = ANSWER_UNKNOWN;
/** This is the set of delete dependencies. It is keyed by a DeleteDependency object. */
protected HashMap deleteDependencies = new HashMap();
/** Constructor. */
public Answer()
{
}
public Answer(Answer other)
{
answer = other.answer;
// Shallow copy is fine, because the stuff in these dependencies is immutable.
deleteDependencies = (HashMap)other.deleteDependencies.clone();
}
public Answer(int value)
{
answer = value;
}
/** Set an answer from initial data. */
public Answer(int answer, DeleteDependency[] deleteDeps)
{
this.answer = answer;
int i = 0;
while (i < deleteDeps.length)
{
DeleteDependency dep = deleteDeps[i++];
deleteDependencies.put(dep,dep);
}
}
/** Get the current answer value.
*/
public int getAnswer()
{
return answer;
}
/** Get the number of delete dependencies */
public int countDeleteDependencies()
{
return deleteDependencies.size();
}
/** Iterate over the delete dependencies. */
public Iterator getDeleteDependencies()
{
return deleteDependencies.keySet().iterator();
}
/** Check if a delete dependency is present */
public boolean hasDependency(DeleteDependency dep)
{
return deleteDependencies.get(dep) != null;
}
/** Initialize this answer object. This sets the answer value to the given value,
* and clears the delete dependency map.
*/
public void initialize(int value)
{
answer = value;
deleteDependencies.clear();
}
/** Copy the answer value from another answer object */
public void duplicate(Answer other)
{
answer = other.answer;
// Shallow copy is fine, because the stuff in these dependencies is immutable.
deleteDependencies = (HashMap)other.deleteDependencies.clone();
}
/** Update the current answer, using a child link's information and answer.
* This method basically decides if the child is relevant, and if so merges the answer from the
* child together with the current value stored here.
*@param childAnswer is the current answer found for the child.
*@param isIncrementingLink is true if this link is the kind being counted, and thus increments
* the hopcount.
*@param linkType is the type of THIS link (for building appropriate delete dependency).
*@param parentIDHash is the hash of the parent document id for THIS link.
*@param childIDHash is the hash of the child document id for THIS link.
*/
public void merge(Answer childAnswer, boolean isIncrementingLink,
String linkType, String parentIDHash, String childIDHash)
{
// For answers, we obviously pick the best answer we can.
// For dependencies, this is the process:
//
// 1) Delete dependencies
// There can be only one delete dependency resulting from any given link. This
// dependency will only be created if the link is "the best" so far. The child
// node's delete dependencies will also be included whenever a new best match is
// found.
//
//
// Now, get the child answer value.
int childAnswerValue = childAnswer.getAnswer();
// If the link is the same kind as the kind of answer we want, then it adds one
// to the distance measurement to the child.
if (answer >= 0)
{
// We have a determined distance; compare it against whatever the child says.
if (childAnswerValue >= 0)
{
if (isIncrementingLink)
childAnswerValue++;
if (childAnswerValue < answer)
{
// Use the child answer value
setAnswerFromChild(childAnswerValue,childAnswer.deleteDependencies,linkType,parentIDHash,childIDHash);
return;
}
}
// Otherwise the current answer stands: the child is either worse, infinite, or unknown.
return;
}
// If the current answer is infinity, use the child answer.
if (answer == ANSWER_INFINITY)
{
if (childAnswerValue >= 0)
{
if (isIncrementingLink)
childAnswerValue++;
// Use the child answer value
setAnswerFromChild(childAnswerValue,childAnswer.deleteDependencies,linkType,parentIDHash,childIDHash);
return;
}
// Leave the current answer.
return;
}
// The current answer is "unknown"; any determined child answer is better.
if (childAnswerValue >= 0)
{
if (isIncrementingLink)
childAnswerValue++;
// Use the child answer value
setAnswerFromChild(childAnswerValue,childAnswer.deleteDependencies,linkType,parentIDHash,childIDHash);
return;
}
// All other cases: just keep the current answer.
}
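// Worked example (illustrative): if the current answer is 5 and the child's answer is 2 over
// a link of the counted type (isIncrementingLink == true), the child contributes 2 + 1 = 3,
// which beats 5; we adopt 3, inherit the child's delete dependencies, and add a dependency on
// THIS link (see setAnswerFromChild() below). Had the child contributed 6 or more, the
// current answer of 5 would have been kept unchanged.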
/** Set answer from child */
protected void setAnswerFromChild(int newAnswer, HashMap childDeleteDependencies, String linkType, String parentIDHash, String childIDHash)
{
answer = newAnswer;
deleteDependencies = (HashMap)childDeleteDependencies.clone();
DeleteDependency x = new DeleteDependency(linkType,parentIDHash,childIDHash);
deleteDependencies.put(x,x);
}
/** Set an answer from initial data. */
public void setAnswer(int answer, DeleteDependency[] deleteDeps)
{
this.answer = answer;
deleteDependencies.clear();
int i = 0;
while (i < deleteDeps.length)
{
DeleteDependency dep = deleteDeps[i++];
deleteDependencies.put(dep,dep);
}
}
}
/** This class describes a document reference. */
protected static class DocumentReference
{
protected String childIdentifierHash;
protected String linkType;
/** Constructor */
public DocumentReference(String childIdentifierHash, String linkType)
{
this.childIdentifierHash = childIdentifierHash;
this.linkType = linkType;
}
/** Get the child identifier */
public String getChildIdentifierHash()
{
return childIdentifierHash;
}
/** Get the link type */
public String getLinkType()
{
return linkType;
}
}
/** This class describes a node link reference. */
protected static class NodeReference
{
/** The node being referred to */
protected DocumentNode theNode;
/** The kind of link it is */
protected String linkType;
/** Constructor */
public NodeReference(DocumentNode theNode, String linkType)
{
this.theNode = theNode;
this.linkType = linkType;
}
/** Get the node */
public DocumentNode getNode()
{
return theNode;
}
/** Get the link type */
public String getLinkType()
{
return linkType;
}
/** Hash function. */
public int hashCode()
{
return theNode.hashCode() + linkType.hashCode();
}
/** Is this equal? */
public boolean equals(Object o)
{
if (!(o instanceof NodeReference))
return false;
NodeReference other = (NodeReference)o;
// DocumentNode objects compare only with themselves.
return theNode.equals(other.theNode) && this.linkType.equals(other.linkType);
}
}
/** This class keeps track of the data associated with a node in the hash map.
* This basically includes the following:
* - the document identifier
* - the 'question' that was asked, which has the form (link type, maximum distance)
* - possibly the 'answer' to the question, which is either ">(maximum distance)", or a number.
* - references to the nodes which care about this answer, if they are still queued.
* - summary of the information we've gathered from children so far (if answer not known yet)
* - references to the children of this node that can affect the answer, including link details
* (if answer not known yet)
*/
protected static class DocumentNode
{
/** The question. */
protected Question question;
/** This is the original answer (if any), which is the current value in the database */
protected int databaseAnswerValue = ANSWER_UNKNOWN;
/** The original database row, if any */
protected Long databaseRow = null;
/** The answer, as calculated up to the level of all the completed children, which will
* not include incomplete child references of this node. This is a starting point for every reassessment
* of this node's current answer. It is adjusted only when additional children are noted as being complete.
*/
protected Answer startingAnswer = new Answer(ANSWER_UNKNOWN);
/** The current best answer. This takes into account the current status of all the child nodes. If the
* node is not complete, then the answer must be viewed as being less than or equal to this value.
*/
protected Answer trialAnswer = new Answer(ANSWER_UNKNOWN);
/** The best (lowest) possible answer value for this node. This value is calculated based on the known
* child link structure of a node, and can only increase. The value will start low (at 0) and will climb
* as more knowledge is gained, as the children's best possible answer value increases upon re-evaluation.
* When the trial answer (above) reaches a value equal to the best possible value, then the node will be
* immediately marked as "complete", and further processing will be considered unnecessary.
* As far as dependencies are concerned, the bestPossibleAnswer includes dependencies that have gone
* into its assessment. These dependencies represent what would need to be changed to invalidate
* the answer as it stands. (Invalidation means that a smaller best possible answer would be possible, so
* only add dependencies would need consideration.)
*
*/
protected Answer bestPossibleAnswer = new Answer(0);
/** Answer complete flag. Will be set to true only if the value of "trialAnswer" is deemed final. */
protected boolean isComplete = false;
/** This flag is meaningful only if the complete flag is set. */
protected boolean writeNeeded = true;
/** Parent nodes who care (i.e. are still queued). This map contains DocumentNode objects. */
protected Map parentsWhoCare = new HashMap();
/** Child node references. This is a reference to an actual document node object which has a parent reference
* back to this one. If the child node is modified, there is an obligation to cause the parent node to be
* re-evaluated. The re-evaluation process examines all child nodes and may adjust the status of the trial
* answer, and may indeed even remove the reference to the child.
* This map contains NodeReference objects. */
protected Map childReferences = new HashMap();
/** Create a document node. This will happen only if there is no comparable one already in the hash.
*/
public DocumentNode(Question question)
{
this.question = question;
}
/** Get the question. */
public Question getQuestion()
{
return question;
}
/** Reset back to an "unknown" state.
*/
public void reset()
{
isComplete = false;
writeNeeded = true;
databaseAnswerValue = ANSWER_UNKNOWN;
databaseRow = null;
trialAnswer.initialize(ANSWER_UNKNOWN);
startingAnswer.initialize(ANSWER_UNKNOWN);
bestPossibleAnswer.initialize(0);
}
/** Clear child references. */
public void clearChildReferences()
{
childReferences.clear();
}
/** Check if there are children. */
public boolean hasChildren()
{
return childReferences.size() > 0;
}
/** Get an answer that's final.
* Returns "unknown" if the current answer is incomplete.
*/
public int getFinalAnswer()
{
if (isComplete)
{
return trialAnswer.getAnswer();
}
else
return ANSWER_UNKNOWN;
}
/** Check if the answer is complete. Returns true if the answer is complete.
*/
public boolean isAnswerComplete()
{
return isComplete;
}
/** Check if the node is complete, given the question it represents. */
public boolean isComplete()
{
return isComplete;
}
/** Check if a write of the answer is needed to the database */
public boolean isWriteNeeded()
{
return writeNeeded;
}
/** Check if answer is still needed.
*/
public boolean isAnswerNeeded()
{
// Check to make sure there are parents that care.
return parentsWhoCare.size() > 0;
}
/** Get best possible answer */
public Answer getBestPossibleAnswer()
{
return bestPossibleAnswer;
}
/** Set best possible answer */
public void setBestPossibleAnswer(Answer answer)
{
bestPossibleAnswer.duplicate(answer);
}
/** Get the current best answer.
*/
public Answer getTrialAnswer()
{
return trialAnswer;
}
/** Set the answer for this node.
*/
public void setTrialAnswer(Answer answer)
{
this.trialAnswer.duplicate(answer);
}
/** Get the starting (base) answer. */
public Answer getStartingAnswer()
{
return startingAnswer;
}
/** Set the starting (base) answer. */
public void setStartingAnswer(Answer answer)
{
startingAnswer.duplicate(answer);
}
/** Mark the node as being "complete", with a write needed. */
public void makeComplete()
{
if (!isComplete)
{
isComplete = true;
writeNeeded = true;
}
}
/** Mark the answer as being "complete", and not needing a write. */
public void makeCompleteNoWrite()
{
isComplete = true;
writeNeeded = false;
}
/** Add a parent who should be notified if this node's answer changes.
* The parent is responsible for figuring out when this reference should be removed.
*/
public void addParent(DocumentNode parent)
{
parentsWhoCare.put(parent,parent);
}
/** Clear the 'write needed' flag, to prevent another write. */
public void clearWriteNeeded()
{
writeNeeded = false;
}
/** Add a child reference.
*@param childRef is the child node reference to add.
*/
public void addChild(NodeReference childRef)
{
childReferences.put(childRef,childRef);
}
/** Remove a child reference.
*@param childRef is the child node reference to remove.
*/
public void removeChild(NodeReference childRef)
{
childReferences.remove(childRef);
}
/** Remove a parent. This method will get called when the parent's answer no longer can be affected by
* this child's answer (probably because the child's answer has become complete).
*/
public void removeParent(DocumentNode parent)
{
parentsWhoCare.remove(parent);
}
/** Iterate through all current parents. This is an iterator over DocumentNode objects. */
public Iterator getCurrentParents()
{
return parentsWhoCare.keySet().iterator();
}
/** Iterate through current children. This is an iterator over NodeReference objects. */
public Iterator getCurrentChildren()
{
return childReferences.keySet().iterator();
}
/** Set the database row and answer value */
public void setSource(Long rowID, int answerValue)
{
this.databaseRow = rowID;
this.databaseAnswerValue = answerValue;
}
/** Get the database row */
public Long getDatabaseRow()
{
return databaseRow;
}
/** Get the database answer value */
public int getDatabaseValue()
{
return databaseAnswerValue;
}
// Do NOT override hashCode() and equals(), since we want a node to match only itself.
}
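// Lifecycle summary (derived from the algorithm description in DocumentHash.askQuestions()):
// a DocumentNode is created for a Question and seeded from the hopcount cache where possible;
// if still incomplete it goes onto the child fetch queue, then cycles through the evaluation
// queue as its children complete, and finally is marked "complete", at which point
// writeCachedDistance() persists the value if a write is still needed.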
/** A queue object allows document nodes to be ordered appropriately for the most efficient execution.
* The queue handles DocumentNode objects exclusively. Mapping of Question to DocumentNode object
* involves structures outside of all queues.
*/
protected static class NodeQueue
{
protected HashMap nodeMap = new HashMap();
/** Constructor.
*/
public NodeQueue()
{
}
/** Queue a document node.
*/
public void addToQueue(DocumentNode node)
{
if (nodeMap.get(node.getQuestion()) == null)
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Adding document node "+node.toString()+" to queue "+toString());
nodeMap.put(node.getQuestion(),node);
}
}
/** Remove a node from the queue. This might happen if the node no longer needs evaluation.
*/
public void removeFromQueue(DocumentNode node)
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Removing document node "+node.toString()+" from queue "+toString());
nodeMap.remove(node.getQuestion());
}
/** Fetch the next object off the queue for processing. Returns null if there are no more objects.
*/
public DocumentNode nextNode()
{
if (nodeMap.size() == 0)
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Retrieving node from queue "+toString()+": none found!");
return null;
}
Question q = (Question)nodeMap.keySet().iterator().next();
DocumentNode dn = (DocumentNode)nodeMap.remove(q);
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Retrieving node "+dn.toString()+" from queue "+toString());
return dn;
}
/** Fetch ALL of the nodes off the queue in one step.
*/
public DocumentNode[] nextNodes()
{
DocumentNode[] rval = new DocumentNode[nodeMap.size()];
Iterator iter = nodeMap.keySet().iterator();
int j = 0;
while (iter.hasNext())
{
Question q = (Question)iter.next();
rval[j++] = (DocumentNode)nodeMap.get(q);
}
nodeMap.clear();
return rval;
}
}
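// Note that NodeQueue is deliberately unordered: it is backed by a HashMap keyed by Question,
// which deduplicates queued nodes and makes retrieval order arbitrary (see the comment in
// queueParents() below about ordering no longer being required).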
/** The Document Hash structure contains the document nodes we are interested in, including those we need answers
* for in order to proceed. The main interface involves specifying a set of questions and receiving the answers. This
* structure permits multiple requests to be made against it, and in-memory caching is used to reduce the amount of database
* activity as much as possible.
* It is also presumed that these requests take place inside the appropriate transactions, since both read and write
* database activity may well occur.
*/
protected class DocumentHash
{
/** The job identifier */
protected Long jobID;
/** This is the map of known questions to DocumentNode objects. */
protected Map questionLookupMap = new HashMap();
/** This is the queue for nodes that need to be initialized, who need child fetching. */
protected NodeQueue childFetchQueue = new NodeQueue();
/** This is the queue for evaluating nodes. For all of these nodes, the processing
* has begun: all child nodes have been queued, and at least a partial answer is present. Evaluating one
* of these nodes involves potentially updating the node's answer, and when that is done, all listed parents
* will be requeued on this queue.
*/
protected NodeQueue evaluationQueue = new NodeQueue();
/** These are the legal link types for the job */
protected String[] legalLinkTypes;
/** The hopcount method */
protected int hopcountMethod;
/** Constructor */
public DocumentHash(Long jobID, String[] legalLinkTypes, int hopcountMethod)
{
this.jobID = jobID;
this.legalLinkTypes = legalLinkTypes;
this.hopcountMethod = hopcountMethod;
}
/** Throw in some questions, and prepare for the answers. */
public int[] askQuestions(Question[] questions)
throws ManifoldCFException
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Questions asked as follows:");
int i = 0;
while (i < questions.length)
{
Logging.hopcount.debug(" Linktype='"+questions[i].getLinkType()+"' DocumentID='"+questions[i].getDocumentIdentifierHash()+"'");
i++;
}
Logging.hopcount.debug("");
}
// The algorithm is complex, and works as follows. There are two queues - a queue for
// starting off a node's evaluation (called the child fetch queue), and a second queue for
// re-evaluating nodes (called the evaluation queue).
//
// Whenever a node is first examined, and no answer is available, the node is placed on the
// child fetch queue. The activity associated with this queue is to fetch a node's children
// and queue them in turn (if needed). But in any case, the node is initialized with the
// best available answer.
//
// If the answer is complete, the node is not placed in any queues.
// Parent nodes do not need to be notified, because
// they must already be in the evaluation queue, and will be processed in time.
//
// If the answer was incomplete, the node will be placed into the evaluation queue. Nodes in this
// queue are there because some of their children have changed state in a meaningful way since the
// last time a tentative answer was calculated. The processing of nodes from this queue involves
// updating the answer value, deciding whether it is complete or not, and, if so, writing the answer
// to the database. Nodes that are not complete but have not been modified are not placed in a
// queue; they are simply left unqueued. When all processing is complete, these nodes will be
// checked and converted to "completed" states.
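// Example trace (illustrative): asking about an unknown document A first places A on the
// child fetch queue. Fetching A's children discovers B and C; incomplete children are queued
// in turn, and A records them as child references. Whenever B or C changes state, A is
// requeued on the evaluation queue; once A's trial answer equals its best possible answer,
// or A has no incomplete children left, A is marked complete and its distance is written to
// the cache.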
int[] answers = new int[questions.length];
DocumentNode[] nodes = queueQuestions(questions);
// Keep these nodes alive, whether or not their answers are already known.
int i = 0;
while (i < nodes.length)
{
// Flag these questions as having a special parent, so they can't be removed.
nodes[i++].addParent(null);
}
// Now, process until we have all the answers we wanted.
while (true)
{
if (Thread.currentThread().isInterrupted())
throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
// Early decision!
// For each question, see if there's a completed answer yet
i = 0;
while (i < questions.length)
{
DocumentNode dn = nodes[i];
int answer = dn.getFinalAnswer();
if (answer == ANSWER_UNKNOWN)
break;
// Found one! Record it, just in case we finish.
answers[i++] = answer;
}
if (i != questions.length)
{
// Evaluation queue has priority. If there's anything waiting on it, process it.
DocumentNode evaluationNode = evaluationQueue.nextNode();
if (evaluationNode != null)
{
// Evaluate!
evaluateNode(evaluationNode);
continue;
}
Logging.hopcount.debug("Found no nodes to evaluate at the moment");
// Nothing left to evaluate. Do the child fetch bit instead.
DocumentNode[] fetchNodes = childFetchQueue.nextNodes();
if (fetchNodes.length > 0)
{
// Fetch children and initialize the node
getNodeChildren(fetchNodes);
continue;
}
Logging.hopcount.debug("Found no children to fetch at the moment");
// Nothing left to do at all.
// Scan the map and convert all non-complete answers to complete ones. They'll
// be left in an incomplete state if there were loops.
Iterator iter = questionLookupMap.values().iterator();
while (iter.hasNext())
{
DocumentNode dn = (DocumentNode)iter.next();
if (!dn.isComplete())
{
makeNodeComplete(dn);
}
}
Logging.hopcount.debug("Made remaining nodes complete");
// Copy out the answer. All nodes are guaranteed to be complete now.
i = 0;
while (i < questions.length)
{
DocumentNode dn = nodes[i];
answers[i++] = dn.getFinalAnswer();
}
Logging.hopcount.debug("Done (copied out the answers)");
}
else
Logging.hopcount.debug("Done (because answers already available)");
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Answers returned as follows:");
i = 0;
while (i < questions.length)
{
Logging.hopcount.debug(" Linktype='"+questions[i].getLinkType()+"' DocumentID='"+questions[i].getDocumentIdentifierHash()+"'"+
" Answer="+Integer.toString(answers[i]));
i++;
}
Logging.hopcount.debug("");
}
return answers;
}
}
/** Evaluate a node from the evaluation queue.
*/
protected void evaluateNode(DocumentNode node)
throws ManifoldCFException
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Evaluating node; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"'"+
" BaseAnswer="+Integer.toString(node.getStartingAnswer().getAnswer())+
" TrialAnswer="+Integer.toString(node.getTrialAnswer().getAnswer()));
}
// The base (or starting) answer should already have been set for this node.
// What we do here is go through all the remaining listed children,
// and evaluate a new trial answer. There are some special cases we want to
// catch:
//
// 1) If an answer goes to zero, then the node is automatically marked "complete".
// All child references are removed.
// 2) Child references should only be kept around if there's a chance they would
// REDUCE the current answer. So, we should keep children that are incomplete.
// Complete children should be factored into the base answer, and discarded.
// 3) If the node is still marked incomplete, AND if there are no parents, then
// simply delete it.
// 4) If the node is now complete, it should be marked as such, and the distance
// from the node to the root should be written into the database. Parent
// should be requeued also.
// 5) If the node is incomplete, and the trial answer has changed, then update the
// trial answer and requeue all parents.
boolean signalParentsNeeded = false;
Answer baseAnswer = new Answer(node.getStartingAnswer());
// The baseAnswer already includes the current node in its add deps, so I don't have to add it here.
// Make a pass through the children, looking for completed nodes.
// Keep track of the ones we find, so we can remove them from the child list.
ArrayList childRemovalList = new ArrayList();
Iterator iter = node.getCurrentChildren();
while (iter.hasNext())
{
NodeReference childRef = (NodeReference)iter.next();
DocumentNode child = childRef.getNode();
String linkType = childRef.getLinkType();
if (child.isComplete())
{
childRemovalList.add(childRef);
baseAnswer.merge(child.getTrialAnswer(),
linkType.equals(node.getQuestion().getLinkType()),
linkType,
node.getQuestion().getDocumentIdentifierHash(),
child.getQuestion().getDocumentIdentifierHash());
}
}
// Get rid of the marked children.
int i = 0;
while (i < childRemovalList.size())
{
NodeReference childRef = (NodeReference)childRemovalList.get(i++);
childRef.getNode().removeParent(node);
node.removeChild(childRef);
}
// Set new starting answer, if it has changed. This will NOT cause a requeue of parents,
// all by itself.
node.setStartingAnswer(baseAnswer);
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting baseAnswer; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"' baseAnswer="+Integer.toString(baseAnswer.getAnswer()));
}
// Now, go through remaining nodes and build a trial answer.
Answer trialAnswer = new Answer(baseAnswer);
iter = node.getCurrentChildren();
while (iter.hasNext())
{
NodeReference childRef = (NodeReference)iter.next();
DocumentNode child = childRef.getNode();
String linkType = childRef.getLinkType();
trialAnswer.merge(child.getTrialAnswer(),
linkType.equals(node.getQuestion().getLinkType()),
linkType,
node.getQuestion().getDocumentIdentifierHash(),
child.getQuestion().getDocumentIdentifierHash());
}
// Get the current trial answer, so we can compare
Answer currentTrialAnswer = node.getTrialAnswer();
if (trialAnswer.getAnswer() != currentTrialAnswer.getAnswer())
{
signalParentsNeeded = true;
}
// See if we mark this "complete".
if (trialAnswer.getAnswer() == node.getBestPossibleAnswer().getAnswer())
{
// Early exit.
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting complete [bestpossible]; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
node.setTrialAnswer(trialAnswer);
makeNodeComplete(node);
signalParentsNeeded = true;
}
else if (!node.hasChildren())
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting complete [nochildren]; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
// Simply have no more children that aren't complete, so we are done.
node.setTrialAnswer(trialAnswer);
// It's complete!
makeNodeComplete(node);
signalParentsNeeded = true;
}
else
{
// Update the answer.
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting trialAnswer; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
node.setTrialAnswer(trialAnswer);
// Still not complete. If it has no parents, it's not needed anymore, so chuck it.
if (!node.isAnswerNeeded())
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Discarding [unneeded]; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"'");
}
// Take this node out of the main map.
questionLookupMap.remove(node.getQuestion());
// Remove all the child references
removeChildLinks(node);
Logging.hopcount.debug("Done node evaluation");
return;
}
}
if (signalParentsNeeded)
{
Logging.hopcount.debug("Requeueing parent nodes");
// Requeue the parents.
queueParents(node);
}
Logging.hopcount.debug("Done node evaluation");
}
/** Fetch the children of a bunch of nodes, and initialize all of the nodes appropriately.
*/
protected void getNodeChildren(DocumentNode[] nodes)
throws ManifoldCFException
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Finding children for the following nodes:");
int z = 0;
while (z < nodes.length)
{
DocumentNode node = nodes[z++];
Logging.hopcount.debug(" DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"'");
}
}
// Need to figure out which nodes need processing, and which don't.
// All of the current nodes are (by definition) not in any queues. We need to keep track of
// which queues these nodes have to go into.
// - Some will just be deleted
// - Some will be made complete, and not put into any queue
//
// Naively, we might presume that some will be queued (on the evaluation queue) as a result of being the
// parent of a node that was changed. But, in fact, being on the "child fetch" queue means that we
// DON'T have any loaded child references yet. So - that can't happen, at least not until the child references
// are loaded and the nodes initialized.
//
// The real question therefore is how, exactly, to handle the situation where we load children for a bunch of
// nodes, and initialize the nodes, and then need to put their parents on the evaluation queue. When we did
// only a single node at a time, the parents became queued but no further evaluation took place here.
// Since one of the nodes being processed may in fact refer to another node being processed, the
// 'full' initialization cannot easily be handled here; the nodes must be simply initialized to a basic incomplete
// state, and put on the evaluation queue, for complete evaluation.
// This is a map where I'll put the nodes that I still need children for, so I can get all children at once.
HashMap nodesNeedingChildren = new HashMap();
// From the nodes needing children, come up with a unique set of parent identifiers, so
// we can get the children as efficiently as possible.
HashMap parentMap = new HashMap();
int k = 0;
while (k < nodes.length)
{
DocumentNode node = nodes[k++];
if (!node.isAnswerNeeded())
{
// If there are no parents for this node, then this node is not currently needed, so just ditch it.
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Discard before getting node children[unneeded]; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"'");
}
questionLookupMap.remove(node.getQuestion());
}
else if (node.getQuestion().getDocumentIdentifierHash().length() == 0)
{
// If this is the root, set all node values accordingly.
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Found root; DocID='"+node.getQuestion().getDocumentIdentifierHash()+"' Linktype='"+
node.getQuestion().getLinkType()+"'");
}
node.setStartingAnswer(new Answer(0));
node.setTrialAnswer(new Answer(0));
node.makeCompleteNoWrite();
queueParents(node);
}
else
{
  nodesNeedingChildren.put(node.getQuestion(),node);
  parentMap.put(node.getQuestion().getDocumentIdentifierHash(),node.getQuestion().getDocumentIdentifierHash());
}
}
// Now, we want all the children of all the nodes that are left - if any
if (nodesNeedingChildren.size() == 0)
return;
// This map will get built as a map keyed by parent document identifier and containing as
// a value an arraylist of DocumentReference objects.
HashMap referenceMap = new HashMap();
int maxClause = maxClauseFindChildren(jobID);
ArrayList list = new ArrayList();
k = 0;
Iterator iter = parentMap.keySet().iterator();
while (iter.hasNext())
{
String parentIDHash = (String)iter.next();
referenceMap.put(parentIDHash,new ArrayList());
if (k == maxClause)
{
findChildren(referenceMap,jobID,list);
k = 0;
list.clear();
}
list.add(parentIDHash);
k++;
}
if (k > 0)
findChildren(referenceMap,jobID,list);
// Go through the 'nodes needing children'. For each node, look up the child references, and create a set
// of questions for all the node children. We'll refer directly to this list when putting together the
// nodes in the last step.
HashMap childQuestionMap = new HashMap();
iter = nodesNeedingChildren.keySet().iterator();
while (iter.hasNext())
{
Question q = (Question)iter.next();
ArrayList childlist = (ArrayList)referenceMap.get(q.getDocumentIdentifierHash());
k = 0;
while (k < childlist.size())
{
DocumentReference dr = (DocumentReference)childlist.get(k++);
Question childQuestion = new Question(dr.getChildIdentifierHash(),q.getLinkType());
childQuestionMap.put(childQuestion,childQuestion);
}
}
// Put together a child question array
Question[] questionsToAsk = new Question[childQuestionMap.size()];
k = 0;
iter = childQuestionMap.keySet().iterator();
while (iter.hasNext())
{
questionsToAsk[k++] = (Question)iter.next();
}
// Ask the questions in batch (getting back nodes that we can then refer to)
DocumentNode[] resultNodes = queueQuestions(questionsToAsk);
// Put the resulting nodes into the map for ease of lookup.
k = 0;
while (k < resultNodes.length)
{
childQuestionMap.put(questionsToAsk[k],resultNodes[k]);
k++;
}
// Now, go through all the nodes that need processing one-by-one, and use the childQuestionMap to find
// the nodes we need, and the referenceMap to find the link details.
iter = nodesNeedingChildren.keySet().iterator();
while (iter.hasNext())
{
Question q = (Question)iter.next();
DocumentNode node = (DocumentNode)nodesNeedingChildren.get(q);
String documentIdentifierHash = q.getDocumentIdentifierHash();
Answer startingAnswer = new Answer(ANSWER_INFINITY);
Answer trialAnswer = new Answer(ANSWER_INFINITY);
int bestPossibleAnswerValue = ANSWER_INFINITY;
ArrayList childReferences = (ArrayList)referenceMap.get(q.getDocumentIdentifierHash());
// Each childReference is a DocumentReference object which will allow the lookup of
// the child node from the childQuestionMap.
k = 0;
while (k < childReferences.size())
{
DocumentReference dr = (DocumentReference)childReferences.get(k++);
String childIdentifierHash = dr.getChildIdentifierHash();
Question lookupQuestion = new Question(childIdentifierHash,q.getLinkType());
DocumentNode childNode = (DocumentNode)childQuestionMap.get(lookupQuestion);
String linkType = dr.getLinkType();
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug(" Child found for DocID='"+documentIdentifierHash+"' Linktype='"+
q.getLinkType()+"'; ID='"+childIdentifierHash+"' linktype='"+linkType+"'");
}
boolean isIncrementing = linkType.equals(node.getQuestion().getLinkType());
int bestPossibleCheckValue = 0;
if (isIncrementing)
{
bestPossibleCheckValue = 1;
}
if (bestPossibleAnswerValue == ANSWER_INFINITY || bestPossibleAnswerValue > bestPossibleCheckValue)
bestPossibleAnswerValue = bestPossibleCheckValue;
// Decide how to tally this - into starting answer (and don't record), or
// record it and scan it later?
// If the node is complete, incorporate it into BOTH the starting answer and the
// trial answer. If incomplete, leave a parent reference around.
Answer childAnswer = childNode.getTrialAnswer();
if (childNode.isComplete())
{
startingAnswer.merge(childAnswer,isIncrementing,
linkType,documentIdentifierHash,childIdentifierHash);
trialAnswer.merge(childAnswer,isIncrementing,
linkType,documentIdentifierHash,childIdentifierHash);
}
else
{
// Add it as a child, and only include these results in the trial answer.
childNode.addParent(node);
node.addChild(new NodeReference(childNode,linkType));
trialAnswer.merge(childAnswer,isIncrementing,
linkType,documentIdentifierHash,childIdentifierHash);
}
}
node.setStartingAnswer(startingAnswer);
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting baseAnswer; DocID='"+documentIdentifierHash+"' Linktype='"+
q.getLinkType()+"' baseAnswer="+Integer.toString(startingAnswer.getAnswer()));
}
// Set up best possible answer
Answer bestPossible = new Answer(bestPossibleAnswerValue);
node.setBestPossibleAnswer(bestPossible);
// See if the node has managed to complete itself, and if so mark it "complete" right away.
if (trialAnswer.getAnswer() == bestPossible.getAnswer())
{
// It's complete, but we need to update the trial answer's add deps
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting complete [bestpossible]; DocID='"+documentIdentifierHash+"' Linktype='"+
q.getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
node.setTrialAnswer(trialAnswer);
makeNodeComplete(node);
}
else if (!node.hasChildren())
{
// It's complete!
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting complete [nochildren]; DocID='"+documentIdentifierHash+"' Linktype='"+
q.getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
node.setTrialAnswer(trialAnswer);
makeNodeComplete(node);
}
else
{
if (Logging.hopcount.isDebugEnabled())
{
Logging.hopcount.debug("Setting trialAnswer; DocID='"+documentIdentifierHash+"' Linktype='"+
q.getLinkType()+"' trialAnswer="+Integer.toString(trialAnswer.getAnswer()));
}
node.setTrialAnswer(trialAnswer);
}
// Notify parents.
queueParents(node);
}
}
/** Get the max clauses.
*/
protected int maxClauseFindChildren(Long jobID)
{
return findConjunctionClauseMax(new ClauseDescription[]{
new UnitaryClause(intrinsicLinkManager.jobIDField,jobID)});
}
/** Get the children of a bunch of nodes.
*/
protected void findChildren(Map referenceMap, Long jobID, ArrayList list)
throws ManifoldCFException
{
ArrayList newList = new ArrayList();
String query = buildConjunctionClause(newList,new ClauseDescription[]{
new UnitaryClause(intrinsicLinkManager.jobIDField,jobID),
new MultiClause(intrinsicLinkManager.parentIDHashField,list)});
// Grab the appropriate rows from the intrinsic link table.
IResultSet set = performQuery("SELECT "+intrinsicLinkManager.childIDHashField+","+intrinsicLinkManager.linkTypeField+","+
intrinsicLinkManager.parentIDHashField+" FROM "+intrinsicLinkManager.getTableName()+" WHERE "+query,newList,null,null);
// What I want to produce from this is a filled-in reference map, where the parentid is the
// key, and the value is an ArrayList of DocumentReference objects.
int i = 0;
while (i < set.getRowCount())
{
IResultRow row = set.getRow(i);
String parentIDHash = (String)row.getValue(intrinsicLinkManager.parentIDHashField);
String childIDHash = (String)row.getValue(intrinsicLinkManager.childIDHashField);
String linkType = (String)row.getValue(intrinsicLinkManager.linkTypeField);
if (linkType == null)
linkType = "";
if (childIDHash == null)
childIDHash = "";
ArrayList children = (ArrayList)referenceMap.get(parentIDHash);
children.add(new DocumentReference(childIDHash,linkType));
i++;
}
}
/** Queue the parents on the evaluation queue. */
protected void queueParents(DocumentNode node)
{
Iterator iter = node.getCurrentParents();
while (iter.hasNext())
{
DocumentNode dn = (DocumentNode)iter.next();
if (dn != null && dn.getTrialAnswer().getAnswer() != ANSWER_UNKNOWN)
{
// This is no longer needed, since it's not ordered anymore.
// evaluationQueue.removeFromQueue(dn);
evaluationQueue.addToQueue(dn);
}
}
}
/** Make a node be complete. This involves writing the node's data to the database,
* if appropriate.
*/
protected void makeNodeComplete(DocumentNode node)
throws ManifoldCFException
{
node.makeComplete();
// Clean up children.
removeChildLinks(node);
if (node.isWriteNeeded())
{
// The answer did not change, so notification of parents is unnecessary.
// But, we need to write this value to the database now.
writeCachedDistance(jobID,legalLinkTypes,node,hopcountMethod);
node.clearWriteNeeded();
}
}
/** Queue up a set of questions. For each question a DocumentNode is located or created, and
* returned. If a node's answer is already complete it is returned as-is; otherwise the node is
* seeded from the cached distances where possible, and any node still lacking a complete answer
* is placed on the child fetch queue for later processing.
*@param questions are the set of questions.
*/
protected DocumentNode[] queueQuestions(Question[] questions)
throws ManifoldCFException
{
DocumentNode[] rval = new DocumentNode[questions.length];
// Map for keeping track of questions that need to check database data.
HashMap requestHash = new HashMap();
int z = 0;
while (z < questions.length)
{
Question q = questions[z++];
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Queuing question: DocID='"+q.getDocumentIdentifierHash()+"' Linktype='"+q.getLinkType()+
"'");
// The first thing to do is locate any existing nodes that correspond to the question,
// and find the ones we need to query the database for.
DocumentNode dn = (DocumentNode)questionLookupMap.get(q);
if (dn != null)
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Question exists: DocID='"+q.getDocumentIdentifierHash()+"' Linktype='"+q.getLinkType()+
"'");
// Try to figure out what to do based on the node's status.
// Possible options include:
// 1) Just use the node's complete answer as it stands
// 2) Wait on the node to have a complete answer
if (dn.isAnswerComplete())
{
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Answer complete for: DocID='"+q.getDocumentIdentifierHash()+"' Linktype='"+q.getLinkType()+
"'");
continue;
}
// The answer is incomplete.
if (Logging.hopcount.isDebugEnabled())
Logging.hopcount.debug("Returning incomplete answer: DocID='"+q.getDocumentIdentifierHash()+"' Linktype='"+q.getLinkType()+
"'");
continue;
}
// If it's the root, build a record with zero distance.
if (q.getDocumentIdentifierHash() == null || q.getDocumentIdentifierHash().length() == 0)
{
Logging.hopcount.debug("Creating root document node, with distance 0");
Answer a = new Answer(0);
dn = new DocumentNode(q);
dn.setStartingAnswer(a);
dn.setTrialAnswer(a);
// Leave bestPossibleAnswer alone. It's not used after node is marked complete.
dn.makeCompleteNoWrite();
questionLookupMap.put(q,dn);
continue;
}
// There is no existing node. Throw the question into a hash
// so we can ask it later (as part of a bulk request).
requestHash.put(q,q);
}
// Query for any cached entries that correspond to questions in the request hash
Question[] unansweredQuestions = new Question[requestHash.size()];
z = 0;
Iterator iter = requestHash.keySet().iterator();
while (iter.hasNext())
{
Question q = (Question)iter.next();
unansweredQuestions[z++] = q;
}
// Look up the cached distances in bulk
DocumentNode[] nodes = readCachedNodes(jobID,unansweredQuestions);
z = 0;
while (z < nodes.length)
{
Question q = unansweredQuestions[z];
DocumentNode dn = nodes[z];
// If the node is not complete, need to queue it.
if (!dn.isComplete())
{
// We don't know the distance, so we need to calculate it.
// Queue the question in the child fetch pool. That pool reads the children and queues them,
// and queues the parent for evaluation.
childFetchQueue.addToQueue(dn);
}
questionLookupMap.put(q,dn);
z++;
}
// Go through the original questions again, and look up the nodes to return.
z = 0;
while (z < questions.length)
{
Question q = questions[z];
rval[z] = (DocumentNode)questionLookupMap.get(q);
z++;
}
return rval;
}
/** Notify parents of a node's change of state. */
protected void notifyParents(DocumentNode node)
{
Iterator iter = node.getCurrentParents();
while (iter.hasNext())
{
DocumentNode dn = (DocumentNode)iter.next();
if (dn.getTrialAnswer().getAnswer() != ANSWER_UNKNOWN)
{
// As long as it's not on the childFetch queue, we put it onto
// the eval queue
evaluationQueue.removeFromQueue(dn);
evaluationQueue.addToQueue(dn);
}
}
}
/** Remove remaining links to children. */
protected void removeChildLinks(DocumentNode dn)
{
Iterator iter = dn.getCurrentChildren();
while (iter.hasNext())
{
NodeReference nr = (NodeReference)iter.next();
// Ditch the parent reference
DocumentNode child = nr.getNode();
child.removeParent(dn);
}
dn.clearChildReferences();
}
}
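// Typical use of DocumentHash (a minimal sketch; the surrounding transaction setup and the
// construction of Question objects happen elsewhere in this class):
//
//   DocumentHash hash = new DocumentHash(jobID,legalLinkTypes,hopcountMethod);
//   Question[] questions = ...; // one per (document identifier hash, link type) pair
//   int[] distances = hash.askQuestions(questions);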
}