/**
*
*/
package ecologylab.bigsemantics.metadata.builtins;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ecologylab.bigsemantics.actions.SemanticAction;
import ecologylab.bigsemantics.actions.SemanticActionHandler;
import ecologylab.bigsemantics.actions.SemanticsConstants;
import ecologylab.bigsemantics.collecting.DocumentDownloadedEventHandler;
import ecologylab.bigsemantics.collecting.DownloadStatus;
import ecologylab.bigsemantics.collecting.SemanticsDownloadMonitors;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.collecting.SemanticsSite;
import ecologylab.bigsemantics.documentcache.PersistentDocumentCache;
import ecologylab.bigsemantics.documentparsers.DocumentParser;
import ecologylab.bigsemantics.downloadcontrollers.CachedPageDownloadController;
import ecologylab.bigsemantics.downloadcontrollers.DownloadController;
import ecologylab.bigsemantics.html.documentstructure.SemanticInLinks;
import ecologylab.bigsemantics.httpclient.SimplHttpResponse;
import ecologylab.bigsemantics.logging.CachedHtmlStale;
import ecologylab.bigsemantics.logging.CachedMmdStale;
import ecologylab.bigsemantics.logging.ChangeLocation;
import ecologylab.bigsemantics.logging.DocumentLogRecord;
import ecologylab.bigsemantics.logging.PersistenceCacheDocHit;
import ecologylab.bigsemantics.logging.PersistenceCacheHtmlHit;
import ecologylab.bigsemantics.logging.PersistenceCacheMiss;
import ecologylab.bigsemantics.logging.Phase;
import ecologylab.bigsemantics.metametadata.FilterLocation;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.model.text.ITermVector;
import ecologylab.bigsemantics.model.text.TermVectorFeature;
import ecologylab.bigsemantics.seeding.SearchResult;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.bigsemantics.seeding.SeedDistributor;
import ecologylab.collections.SetElement;
import ecologylab.concurrent.Downloadable;
import ecologylab.generic.Continuation;
import ecologylab.io.DownloadProcessor;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.SIMPLTranslationException;
import ecologylab.serialization.SimplTypesScope;
import ecologylab.serialization.formatenums.StringFormat;
import ecologylab.serialization.library.geom.PointInt;
/**
 * New Container object. Mostly just a closure around Document. Used as a candidate and wrapper for
 * downloading.
 *
 * @author andruid
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public class DocumentClosure extends SetElement
implements TermVectorFeature, Downloadable, SemanticsConstants,
Continuation<DocumentClosure>
{
// Shared logger for all closures; initialized eagerly in the static block below.
static Logger logger;
static
{
logger = LoggerFactory.getLogger(DocumentClosure.class);
}
// Global semantics session state (repository, caches, download monitors).
private SemanticsGlobalScope semanticsScope;
/**
 * This is tracked mainly for debugging, so we can see what pURL was fed into the meta-metadata
 * address resolver machine.
 */
private ParsedURL initialPURL;
// The wrapped Document; may be swapped for a more specific instance via changeDocument().
private Document document;
// Guards reads and swaps of the document reference.
private final Object DOCUMENT_LOCK = new Object();
// Current life-cycle state; mirrored into the Document on every change.
private DownloadStatus downloadStatus = DownloadStatus.UNPROCESSED;
// Guards all reads/writes of downloadStatus.
private final Object DOWNLOAD_STATUS_LOCK = new Object();
// Parser chosen for this document; may be preset externally via setDocumentParser().
private DocumentParser documentParser;
// Inlink context for this document; refreshed when the document is swapped.
private SemanticInLinks semanticInlinks;
// Callbacks dispatched after download/parse completes; nulled once recycled.
private List<Continuation<DocumentClosure>> continuations;
/**
 * Keeps state about the search process, if this encapsulates a search result;
 */
private SearchResult searchResult;
// Drag-and-drop origin point, when this closure was created by a DnD gesture.
private PointInt dndPoint;
/**
 * If true (the normal case), then any MediaElements encountered will be added to the candidates
 * collection, for possible inclusion in the visual information space.
 */
private boolean collectMedia = true;
/**
 * If true (the normal case), then hyperlinks encountered will be fed to the web crawler, providing
 * that they are traversable() and of the right mime types.
 */
private boolean crawlLinks = true;
// Serializes synchronous downloads of this closure (see performDownloadSynchronously()).
private final Object DOWNLOAD_LOCK = new Object();
/**
 * Construct a closure around the given document, in the given semantics scope.
 *
 * @param document the Document to wrap; its current location seeds initialPURL.
 * @param semanticsSessionScope global semantics session state.
 * @param semanticInlinks inlink context for this document.
 */
private DocumentClosure(Document document,
SemanticsGlobalScope semanticsSessionScope,
SemanticInLinks semanticInlinks)
{
super();
this.semanticsScope = semanticsSessionScope;
this.initialPURL = document.getLocation();
this.document = document;
this.semanticInlinks = semanticInlinks;
this.continuations = new ArrayList<Continuation<DocumentClosure>>();
}
/**
 * Should only be called by Document.getOrCreateClosure().
 *
 * @param document the Document to wrap; supplies its own semantics scope.
 * @param semanticInlinks inlink context for this document.
 */
DocumentClosure(Document document, SemanticInLinks semanticInlinks)
{
this(document, document.getSemanticsScope(), semanticInlinks);
}
/**
 * @return the infoCollector
 */
public SemanticsGlobalScope getSemanticsScope()
{
return semanticsScope;
}
/**
 * @return the location originally fed into this closure, before any redirects or filtering.
 */
public ParsedURL getInitialPURL()
{
return initialPURL;
}
/**
 * @return the document
 */
public Document getDocument()
{
synchronized (DOCUMENT_LOCK)
{
return document;
}
}
/**
 * @return the parser currently associated with this closure, or null if none chosen yet.
 */
public DocumentParser getDocumentParser()
{
return documentParser;
}
/**
 * @param presetDocumentParser
 * the presetDocumentParser to set
 */
public void setDocumentParser(DocumentParser presetDocumentParser)
{
this.documentParser = presetDocumentParser;
}
@Override
public SemanticsSite getSite()
{
// Snapshot the field to avoid racing with changeDocument().
Document document = this.document;
return (document == null) ? null : document.getSite();
}
/**
 * The site used for download throttling/accounting. File-based documents are exempt from any
 * site's download policy, so they report no download site.
 *
 * @return the document's site, or null if there is no document or it is loaded from a file.
 */
@Override
public SemanticsSite getDownloadSite()
{
  Document snapshot = this.document;
  if (snapshot == null)
  {
    return null;
  }
  // Local files are not governed by any site's download policy.
  return snapshot.getDownloadLocation().isFile() ? null : snapshot.getSite();
}
/**
 * @return true iff this closure's site is the given (non-null) site instance.
 */
public boolean isFromSite(SemanticsSite site)
{
return site != null && site == getSite();
}
@Override
public ParsedURL location()
{
// Snapshot to avoid racing with changeDocument().
Document document = this.document;
return (document == null) ? null : document.getLocation();
}
@Override
public ParsedURL getDownloadLocation()
{
Document document = this.document;
return (document == null) ? null : document.getDownloadLocation();
}
/**
 * @return the semanticInlinks
 */
public SemanticInLinks getSemanticInlinks()
{
return semanticInlinks;
}
/**
 * Keeps state about the search process, if this Container is a search result;
 */
public SearchResult searchResult()
{
return searchResult;
}
/**
 * Associate search-result bookkeeping with this closure.
 *
 * @param resultDistributer
 *          distributor aggregating the (seeding) searches.
 * @param resultNum
 *          Result number among those returned by the search engine.
 */
public void setSearchResult(SeedDistributor resultDistributer, int resultNum)
{
searchResult = new SearchResult(resultDistributer, resultNum);
}
public SeedDistributor resultDistributer()
{
return (searchResult == null) ? null : searchResult.resultDistributer();
}
@Override
public DocumentLogRecord getLogRecord()
{
// NOTE(review): assumes document is non-null here; would NPE on a recycled closure.
return document.logRecord();
}
@Override
public boolean isImage()
{
return document.isImage();
}
public boolean isSeed()
{
return (document != null) && document.isSeed();
}
public Seed getSeed()
{
return document != null ? document.getSeed() : null;
}
/**
 * @return true iff this closure originated from a drag-and-drop gesture.
 */
public boolean isDnd()
{
return dndPoint != null;
}
public PointInt getDndPoint()
{
return dndPoint;
}
public void setDndPoint(PointInt dndPoint)
{
this.dndPoint = dndPoint;
}
/**
 * This method is called before we actually hit the website. Thus, it uses the initial URL to test
 * if we need to hit the website. If it returns true, we definitely don't need to hit the website;
 * if it returns false, we need to hit the website, but the actual document might have been cached
 * using another URL.
 */
@Override
public boolean isCached()
{
// Always report not-cached here; the persistent cache is consulted inside performDownload().
return false;
}
/**
 * @return the downloadStatus
 */
public DownloadStatus getDownloadStatus()
{
synchronized (DOWNLOAD_STATUS_LOCK)
{
return downloadStatus;
}
}
public boolean isUnprocessed()
{
return getDownloadStatus() == DownloadStatus.UNPROCESSED;
}
/**
 * Test state variable inside of QUEUE_DOWNLOAD_LOCK.
 *
 * @return true if result has already been queued, connected to, downloaded, ... so it should not
 * be operated on further.
 */
public boolean downloadHasBeenQueued()
{
return getDownloadStatus() != DownloadStatus.UNPROCESSED;
}
/**
 * Test and set state variable inside of QUEUE_DOWNLOAD_LOCK.
 *
 * @return true if this really queues the download, and false if it had already been queued.
 */
private boolean testAndSetQueueDownload()
{
synchronized (DOWNLOAD_STATUS_LOCK)
{
// Only the first caller wins the UNPROCESSED -> QUEUED transition; everyone else is rejected.
if (downloadStatus != DownloadStatus.UNPROCESSED)
return false;
setDownloadStatusInternal(DownloadStatus.QUEUED);
return true;
}
}
/**
 * Thread-safe setter for downloadStatus; also propagates the new status to the Document.
 *
 * @param newStatus the new life-cycle state.
 */
private void setDownloadStatus(DownloadStatus newStatus)
{
synchronized (DOWNLOAD_STATUS_LOCK)
{
setDownloadStatusInternal(newStatus);
}
}
/**
 * (this method does not lock DOWNLOAD_STATUS_LOCK!)
 *
 * @param newStatus
 */
private void setDownloadStatusInternal(DownloadStatus newStatus)
{
this.downloadStatus = newStatus;
// Keep the wrapped Document's status in sync with the closure's.
if (this.document != null)
{
document.setDownloadStatus(newStatus);
}
}
/**
 * @return the DownloadProcessor appropriate for this closure's kind (image / DnD / seed / GUI).
 */
public DownloadProcessor<DocumentClosure> downloadMonitor()
{
SemanticsDownloadMonitors downloadMonitors = semanticsScope.getDownloadMonitors();
return downloadMonitors.downloadProcessor(document.isImage(),
isDnd(),
isSeed(),
document.isGui());
}
/**
 * Download if necessary, using the {@link ecologylab.concurrent.DownloadMonitor DownloadMonitor}
 * if USE_DOWNLOAD_MONITOR is set (it seems it always is), or in a new thread. Control will be
 * passed to {@link #downloadAndParse() downloadAndParse()}. Does nothing if this has been
 * previously queued, if it has been recycled, or if it isMuted().
 *
 * @return true if this is actually queued for download. false if it was previously, if its been
 * recycled, or if it is muted.
 */
public boolean queueDownload()
{
if (recycled())
{
debugA("ERROR: cant queue download cause already recycled.");
return false;
}
// Nothing to fetch without a download location.
if (this.getDownloadLocation() == null)
return false;
final boolean result = !filteredOut(); // for dashboard type on the fly filtering
if (result)
{
// Atomically claim the UNPROCESSED -> QUEUED transition; losing the race means bail out.
if (!testAndSetQueueDownload())
return false;
delete(); // remove from candidate pools! (invokes deleteHook as well)
// Pass this closure as its own continuation only when there are callbacks to dispatch.
downloadMonitor().download(this, continuations == null ? null : this);
}
return result;
}
/**
 * In use cases such as the service, we want to be able to call performDownload() synchronously,
 * and in the same time make sure that the same closure will be downloaded by one thread at a
 * time. This method uses a lock to implement this.
 *
 * @param noCacheRead true to skip reading from the persistent cache.
 * @param noCacheWrite true to skip writing to the persistent cache.
 * @return the download status after the attempt completes.
 * @throws IOException
 */
public DownloadStatus performDownloadSynchronously(boolean noCacheRead, boolean noCacheWrite)
throws IOException
{
synchronized (DOWNLOAD_LOCK)
{
performDownload(noCacheRead, noCacheWrite);
return downloadStatus;
}
}
/**
 * Connect to the information resource. Figure out the appropriate MetaMetadata and DocumentType.
 * Download the information resource and parse it. Do cleanup afterwards.
 *
 * This method is typically called by DownloadMonitor.
 *
 * @throws IOException
 */
@Override
public void performDownload() throws IOException
{
// Default: both cache reads and cache writes are enabled.
performDownload(false, false);
}
/**
 * Core download-and-parse pipeline: consult the persistent cache, download raw content if
 * needed, resolve redirects, pick meta-metadata and a parser, extract, and persist results.
 *
 * @param noCacheRead true to bypass reading the persistent cache (forced if mmd is no-cache).
 * @param noCacheWrite true to bypass writing to the persistent cache (forced if mmd is no-cache).
 * @throws IOException
 */
public void performDownload(boolean noCacheRead, boolean noCacheWrite) throws IOException
{
MetaMetadata metaMetadata = (MetaMetadata) document.getMetaMetadata();
// A no-cache meta-metadata forces both cache reads and writes off.
if (metaMetadata.isNoCache())
{
noCacheRead = true;
noCacheWrite = true;
}
synchronized (DOWNLOAD_STATUS_LOCK)
{
logger.info("Entering performDownload(), downloadStatus = " + downloadStatus + " skipCache=" + noCacheRead);
if (noCacheRead)
{
// Even when bypassing the cache, never re-enter a download already in flight.
switch (downloadStatus)
{
case CONNECTING:
case PARSING:
return;
default:
break;
}
}
else
{
if (recycled() || document.isRecycled())
{
logger.error("Recycled document closure in performDownload(): " + document);
return;
}
// With cache reads enabled, also skip terminal states (done / error / recycled).
switch (downloadStatus)
{
case CONNECTING:
case PARSING:
case DOWNLOAD_DONE:
case IOERROR:
case RECYCLED:
return;
default:
break;
}
}
logger.info("Changing status from " + downloadStatus + " to connecting: " + this);
setDownloadStatusInternal(DownloadStatus.CONNECTING);
}
ParsedURL location = location();
DocumentLogRecord logRecord = getLogRecord();
PersistentDocumentCache pCache = semanticsScope.getPersistentDocumentCache();
logRecord.beginPhase(Phase.DOWNLOAD_AND_PARSE);
// Check the persistent cache first
PersistenceMetaInfo cacheMetaInfo = null;
String cachedRawContent = null;
Document cachedDoc = null;
if (pCache != null && !noCacheRead)
{
logRecord.beginPhase(Phase.PCACHE_READ);
try
{
cacheMetaInfo = pCache.getMetaInfo(location);
if (cacheMetaInfo != null)
{
logRecord.setPersistenceMetaInfo(cacheMetaInfo);
// check if cached raw content is too old.
Date accessTime = cacheMetaInfo.getAccessTime();
Date currentTime = new Date();
long diff = currentTime.getTime() - accessTime.getTime();
long cacheLifeMs = metaMetadata.getCacheLifeMs();
if (diff <= cacheLifeMs)
{
// it's not too old, we should use the cached raw content.
cachedRawContent = pCache.retrieveRawContent(cacheMetaInfo);
logRecord.logPost().addEventNow(new PersistenceCacheHtmlHit());
// check if cached document needs to be re-extracted
// (the extraction hash changes whenever the meta-metadata wrapper changes).
String currentHash = metaMetadata.getHashForExtraction();
if (currentHash.equals(cacheMetaInfo.getMmdHash()))
{
cachedDoc = pCache.retrieveDoc(cacheMetaInfo);
logRecord.logPost().addEventNow(new PersistenceCacheDocHit());
}
else
{
logRecord.logPost().addEventNow(new CachedMmdStale());
}
}
else
{
logRecord.logPost().addEventNow(new CachedHtmlStale());
}
}
else
{
logRecord.logPost().addEventNow(new PersistenceCacheMiss());
}
}
catch (Exception e)
{
// Cache failures are non-fatal: fall through to a fresh download.
String errMsg = "Error accessing persistence cache.";
logger.error(errMsg, e);
logRecord.addErrorRecord(errMsg, e);
}
logRecord.endPhase(Phase.PCACHE_READ);
}
// If not in the persistent cache, download the raw page and parse
if (cachedDoc != null)
{
// Full cache hit: swap in the cached document; no download or parse needed.
semanticsScope.getLocalDocumentCollection().remap(document, cachedDoc);
changeDocument(cachedDoc);
}
else
{
DownloadController downloadController = null;
boolean rawContentDownloaded = false;
if (cachedRawContent != null)
{
// Raw-content hit with a stale extraction: replay the cached page through the parser.
downloadController =
new CachedPageDownloadController(cacheMetaInfo.getLocation(),
cacheMetaInfo.getRawAdditionalLocations(),
cacheMetaInfo.getCharset(),
cacheMetaInfo.getMimeType(),
200,
"OK",
cachedRawContent);
}
else
{
downloadController = downloadRawPage(location);
rawContentDownloaded = true;
}
if (downloadController.isGood())
{
handleRedirections(downloadController, location);
// The response mime type may select a more specific meta-metadata (and document subtype).
metaMetadata =
changeMetaMetadataIfNeeded(downloadController.getHttpResponse().getMimeType());
findParser(metaMetadata, downloadController);
if (documentParser != null)
{
doParse(metaMetadata);
if (pCache != null && !noCacheWrite)
{
doPersist(pCache, downloadController, document, rawContentDownloaded);
}
documentParser = null;
}
}
else
{
logger.error("Network connection error: " + document);
setDownloadStatus(DownloadStatus.IOERROR);
logRecord.endPhase(Phase.DOWNLOAD_AND_PARSE);
return;
}
downloadController.recycle();
}
document.downloadAndParseDone(documentParser);
logRecord.endPhase(Phase.DOWNLOAD_AND_PARSE);
setDownloadStatus(DownloadStatus.DOWNLOAD_DONE);
}
/**
 * Download the raw page at the given location, recording download-phase timing and the HTTP
 * status code in the log record.
 *
 * @param location the URL to fetch.
 * @return the controller holding the response; callers check isGood() before use.
 * @throws IOException
 */
private DownloadController downloadRawPage(ParsedURL location) throws IOException
{
getLogRecord().beginPhase(Phase.DOWNLOAD);
String userAgent = document.getMetaMetadata().getUserAgentString();
DownloadController downloadController = semanticsScope.createDownloadController(this);
downloadController.setUserAgent(userAgent);
if (downloadController.accessAndDownload(location))
{
SimplHttpResponse httpResp = downloadController.getHttpResponse();
getLogRecord().setDownloadStatusCode(httpResp.getCode());
}
getLogRecord().endPhase(Phase.DOWNLOAD);
return downloadController;
}
/**
 * Resolve the final URL after redirects: switch to the Document for the landing URL, apply any
 * meta-metadata location filter, and register alternative locations with the local collection.
 *
 * @param downloadController source of the HTTP response (final URL, other locations).
 * @param location the original request location (currently unused; kept for context).
 */
private void handleRedirections(DownloadController downloadController, ParsedURL location)
{
String newUrl = downloadController.getHttpResponse().getUrl();
ParsedURL newPurl = ParsedURL.getAbsolute(newUrl);
Document newDoc = semanticsScope.getOrConstructDocument(newPurl);
// From here on, the 'document' field refers to the landing-URL document.
changeDocument(newDoc);
MetaMetadataCompositeField mmd = newDoc.getMetaMetadata();
if (mmd instanceof MetaMetadata)
{
FilterLocation filter = ((MetaMetadata) mmd).getFilterLocation();
if (filter != null)
{
ArrayList<ParsedURL> altLocs = new ArrayList<ParsedURL>();
try
{
// The filter may normalize the location and emit alternative locations into altLocs.
newPurl = filter.filter(newPurl, altLocs);
if (newPurl != null && !newPurl.equals(document.getLocation()))
{
document.setLocation(newPurl);
}
for (ParsedURL altLoc : altLocs)
{
document.addAdditionalLocation(altLoc);
semanticsScope.getLocalDocumentCollection().addMapping(altLoc, document);
}
}
catch (Exception e)
{
logger.error("Exception filtering location " + newPurl, e);
}
}
}
// handle other locations:
List<ParsedURL> otherLocations = downloadController.getHttpResponse().getOtherPurls();
if (otherLocations != null)
{
for (ParsedURL otherLocation : otherLocations)
{
if (otherLocation != null)
{
document.addAdditionalLocation(otherLocation);
semanticsScope.getLocalDocumentCollection().addMapping(otherLocation, document);
}
}
}
}
/**
 * If the current meta-metadata is generic, try to find a more specific one by mime type; when
 * found, possibly replace the document with an instance of the more specific metadata class.
 *
 * @param mimeType the mime type reported by the HTTP response.
 * @return the (possibly more specific) meta-metadata now governing this document.
 */
private MetaMetadata changeMetaMetadataIfNeeded(String mimeType)
{
MetaMetadata metaMetadata = (MetaMetadata) document.getMetaMetadata();
// check for more specific meta-metadata
if (metaMetadata.isGenericMetadata())
{ // see if we can find more specifc meta-metadata using mimeType
MetaMetadataRepository repository = semanticsScope.getMetaMetadataRepository();
MetaMetadata mimeMmd = repository.getMMByMime(mimeType);
if (mimeMmd != null && !mimeMmd.equals(metaMetadata))
{
// new meta-metadata!
if (!mimeMmd.getMetadataClass().isAssignableFrom(document.getClass()))
{
// more specific so we need new metadata!
Document document = (Document) mimeMmd.constructMetadata(); // set temporary on stack
changeDocument(document);
}
metaMetadata = mimeMmd;
// note: the local 'document' above is out of scope here, so this refers to the field,
// which changeDocument() updated when a new instance was constructed.
document.setMetaMetadata(mimeMmd);
}
}
return metaMetadata;
}
/**
 * Locate a DocumentParser for the given meta-metadata, unless one was preset via
 * setDocumentParser(). Leaves documentParser null (with a warning) when none is found.
 *
 * @param metaMetadata the meta-metadata to find a parser for.
 * @param downloadController carries the downloaded content the parser will consume.
 */
private void findParser(MetaMetadata metaMetadata, DownloadController downloadController)
{
if (documentParser == null)
{
boolean noParser = false;
// // First check if registered no parser
// noParser = DocumentParser.isRegisteredNoParser(document.getLocation());
// List<MetadataParsedURL> additionalLocations = document.getAdditionalLocations();
// if (additionalLocations != null)
// {
// for (int i = 0; i < additionalLocations.size() && !noParser; ++i)
// {
// noParser |= DocumentParser.isRegisteredNoParser(additionalLocations.get(i).getValue());
// }
// }
// NOTE(review): the no-parser registry check above is disabled, so noParser is always false
// and the warning branch below is currently unreachable.
if (noParser)
{
logger.warn("Registered no parser: " + document);
}
else
{
// If not registered no parser, try to find one
documentParser =
DocumentParser.getByMmd(metaMetadata, semanticsScope, this, downloadController);
if (documentParser == null)
{
logger.warn("No parser found: " + metaMetadata);
}
}
}
}
/**
 * Run the extraction phase: before-actions, parse, after-actions, and doc-graph callbacks.
 *
 * @param metaMetadata the meta-metadata whose semantic actions bracket the parse.
 * @throws IOException
 */
private void doParse(MetaMetadata metaMetadata) throws IOException
{
getLogRecord().beginPhase(Phase.EXTRACT);
// container or not (it could turn out to be an image or some other mime type), parse the baby!
setDownloadStatus(DownloadStatus.PARSING);
takeSemanticActions(metaMetadata, metaMetadata.getBeforeSemanticActions());
documentParser.parse();
takeSemanticActions(metaMetadata, metaMetadata.getAfterSemanticActions());
addDocGraphCallbacksIfNeeded();
getLogRecord().endPhase(Phase.EXTRACT);
}
/**
 * Run the given semantic actions (before- or after-parse hooks) against the current document.
 * A no-op when either the meta-metadata or the action list is absent.
 *
 * @param metaMetadata the meta-metadata the actions belong to.
 * @param actions the actions to execute; may be null.
 */
private void takeSemanticActions(MetaMetadata metaMetadata, ArrayList<SemanticAction> actions)
{
  if (metaMetadata == null || actions == null)
  {
    return;
  }
  new SemanticActionHandler(semanticsScope, documentParser)
      .takeSemanticActions(metaMetadata, document, actions);
}
/**
 * When the scope auto-updates document references, register all listeners waiting on this
 * document as continuations, so references get refreshed once download/parse completes.
 */
private void addDocGraphCallbacksIfNeeded()
{
if (this.getSemanticsScope().ifAutoUpdateDocRefs())
{
// add callbacks so that when this document is downloaded and parsed, references to it will
// be updated automatically.
Set<DocumentDownloadedEventHandler> listeners =
semanticsScope.getDocumentDownloadingMonitor().getListenersForDocument(document);
if (listeners != null && listeners.size() > 0)
{
addContinuations(listeners);
}
}
}
/**
 * Write results to the persistent cache: store raw content + extracted doc after a fresh
 * download, or just update the extracted doc when the raw content came from the cache.
 * Failures are logged and recorded, not propagated.
 *
 * @param pCache the persistent cache (caller guarantees non-null).
 * @param downloadController holds the raw content / charset / mime type to store.
 * @param doc the document to persist.
 * @param rawContentDownloaded true if raw content was freshly downloaded (store everything);
 *          false if it came from the cache (update the extracted doc only).
 * @throws IOException
 */
private void doPersist(PersistentDocumentCache pCache,
DownloadController downloadController,
Document doc,
boolean rawContentDownloaded)
throws IOException
{
getLogRecord().beginPhase(Phase.PCACHE_WRITE);
try
{
if (rawContentDownloaded)
{
PersistenceMetaInfo metaInfo =
pCache.store(doc,
downloadController.getHttpResponse().getContent(),
downloadController.getHttpResponse().getCharset(),
downloadController.getHttpResponse().getMimeType(),
doc.getMetaMetadata().getHashForExtraction());
getLogRecord().setId(metaInfo.getDocId());
getLogRecord().setPersistenceMetaInfo(metaInfo);
}
else
{
PersistenceMetaInfo metaInfo = pCache.getMetaInfo(doc.getLocation());
pCache.updateDoc(metaInfo, doc);
}
}
catch (Exception e)
{
// Persistence problems must not fail the download pipeline.
String errMsg = "Error storing to persistence cache.";
logger.error(errMsg, e);
getLogRecord().addErrorRecord(errMsg, e);
}
getLogRecord().endPhase(Phase.PCACHE_WRITE);
}
/**
 * Dispatch all of our registered callbacks.
 *
 * Iterates over a snapshot of the continuation list so that callbacks may register further
 * continuations without ConcurrentModificationException. An exception thrown by one
 * continuation is logged and does not stop the remaining ones.
 *
 * @param o the closure to pass to each continuation (normally this).
 */
@Override
public void callback(DocumentClosure o)
{
  // Snapshot the field once: another thread may null it after recycling, so the original's
  // pattern of re-reading the field for synchronized(continuations) could NPE.
  List<Continuation<DocumentClosure>> continuations = this.continuations;
  if (continuations == null)
    return;
  List<Continuation<DocumentClosure>> currentContinuations;
  synchronized (continuations)
  {
    // Copy so callbacks can safely add more continuations while we iterate.
    currentContinuations = new ArrayList<Continuation<DocumentClosure>>(continuations);
  }
  // (The original re-checked currentContinuations != null here; it was just assigned from a
  // constructor call, so that check was dead code and has been removed.)
  for (Continuation<DocumentClosure> continuation : currentContinuations)
  {
    try
    {
      continuation.callback(o);
    }
    catch (Exception e)
    {
      logger.error("Error calling back: " + o + ": " + continuation, e);
    }
  }
  // wait to recycle continuations until after they have been called.
  if (isRecycled())
  {
    // Clear under the same monitor the add* methods use, to avoid racing with registration.
    synchronized (continuations)
    {
      continuations.clear();
    }
    this.continuations = null;
  }
}
/**
 * @return the live continuation list (not a copy); null once the closure has been recycled.
 */
public List<Continuation<DocumentClosure>> getContinuations()
{
return continuations;
}
// Internal accessor; callers must hold the list's monitor while mutating it.
private List<Continuation<DocumentClosure>> continuations()
{
return continuations;
}
/**
 * Register a callback to run after download/parse completes.
 * NOTE(review): throws NPE if called after 'continuations' has been nulled by recycling;
 * prefer addContinuationBeforeDownloadDone() where applicable.
 */
public void addContinuation(Continuation<DocumentClosure> continuation)
{
synchronized (continuations)
{
continuations().add(continuation);
}
}
/**
 * Register a batch of callbacks to run after download/parse completes.
 *
 * @param incomingContinuations the continuations to append, in iteration order.
 */
public void addContinuations(Collection<? extends Continuation<DocumentClosure>> incomingContinuations)
{
  synchronized (continuations)
  {
    // addAll appends in iteration order, identical to the former element-by-element loop.
    continuations().addAll(incomingContinuations);
  }
}
/**
 * Register a callback at the front of the list, so it runs before previously added ones.
 */
public void addContinuationBefore(Continuation<DocumentClosure> continuation)
{
synchronized (continuations)
{
continuations().add(0, continuation);
}
}
/**
 * Add a continuation to this closure before it is downloaded (i.e. before its performDownload()
 * method finishes).
 *
 * This gives the client the possibility of making sure the continuation will be called when the
 * closure finishes downloading.
 *
 * @param continuation the callback to register.
 * @return true if the continuation is added before the closure finishes downloading; false if the
 *         closure is already downloaded (or failed / recycled).
 */
public boolean addContinuationBeforeDownloadDone(Continuation<DocumentClosure> continuation)
{
  // Unsynchronized fast path; the authoritative check is repeated under the lock below.
  if (!isDownloadTerminated())
  {
    synchronized (DOWNLOAD_STATUS_LOCK)
    {
      if (!isDownloadTerminated())
      {
        addContinuation(continuation);
        return true;
      }
    }
  }
  return false;
}
/**
 * @return true if the download has reached a terminal state (done, I/O error, or recycled).
 *         (Extracted to remove the condition formerly duplicated verbatim in the method above.)
 */
private boolean isDownloadTerminated()
{
  return downloadStatus == DownloadStatus.DOWNLOAD_DONE
      || downloadStatus == DownloadStatus.IOERROR
      || downloadStatus == DownloadStatus.RECYCLED;
}
/**
 * Document metadata object must change, because we learned something new about its type.
 *
 * Swaps in newDocument under DOCUMENT_LOCK, fixes up site download accounting, inherits values
 * and the log record from the old document, logs a location change, and recycles the old
 * document. A no-op when newDocument is already the current document.
 *
 * @param newDocument
 */
public void changeDocument(Document newDocument)
{
synchronized (DOCUMENT_LOCK)
{
if (newDocument != document)
{
Document oldDocument = document;
document = newDocument;
logger.info("Changing {} to {}", oldDocument, newDocument);
SemanticsSite oldSite = oldDocument.site();
SemanticsSite newSite = newDocument.site();
if (oldSite != null && oldSite != newSite)
{
// calling changeDocument() because of redirecting?
if (oldSite.isDownloading())
oldSite.endDownload(oldDocument.getDownloadLocation());
}
newDocument.inheritValues(oldDocument);
semanticInlinks = newDocument.getSemanticInlinks(); // probably not needed, but just in
// case.
newDocument.setLogRecord(oldDocument.getLogRecord());
ParsedURL oldLoc = oldDocument.getLocation();
ParsedURL newLoc = newDocument.getLocation();
if (oldLoc != null && !oldLoc.equals(newLoc))
{
ChangeLocation changeLocationEvent = new ChangeLocation(oldLoc, newLoc);
getLogRecord().logPost().addEventNow(changeLocationEvent);
}
oldDocument.recycle();
}
}
}
/**
 * Close the current connection. Re-open a connection to the same location. Use the same Document
 * object; don't process re-directs, or anything like that. Re-connect simply.
 *
 * @return DownloadController for the new connection.
 * @throws IOException
 */
public DownloadController reConnect() throws IOException
{
DownloadController downloadController = semanticsScope.createDownloadController(this);
// Best effort: the success flag from accessAndDownload() is intentionally ignored here;
// callers inspect the returned controller.
downloadController.accessAndDownload(document.getLocation());
return downloadController;
}
/**
 * Recycle this closure without recycling the wrapped Document.
 */
@Override
public void recycle()
{
recycle(false);
}
/**
 * Recycle this closure, releasing the parser and inlink references. Idempotent: returns
 * immediately once status is already RECYCLED.
 *
 * @param recycleDocument true to also recycle the wrapped Document.
 */
@Override
public synchronized void recycle(boolean recycleDocument)
{
synchronized (DOWNLOAD_STATUS_LOCK)
{
if (downloadStatus == DownloadStatus.RECYCLED)
return;
setDownloadStatusInternal(DownloadStatus.RECYCLED);
}
if (documentParser != null)
documentParser.recycle();
semanticInlinks = null;
initialPURL = null;
// ??? should we recycle Document here -- under what circumstances???
if (recycleDocument)
document.recycle();
}
/**
 * @return true if there is no document, or the document has been recycled.
 */
@Override
public boolean recycled()
{
  // Snapshot to avoid racing with changeDocument() / recycle() between check and dereference.
  Document document = this.document;
  return document == null || document.isRecycled();
}
/**
 * Same predicate as {@link #recycled()}; kept as a separate override for interface
 * compatibility. Delegates so both paths share the race-safe snapshot (the original duplicated
 * the logic here without the local snapshot).
 */
@Override
public boolean isRecycled()
{
  return recycled();
}
/**
 * Resets this closure as if it is newly created.
 */
public void reset()
{
setDownloadStatus(DownloadStatus.UNPROCESSED);
if (document != null)
{
// Allow a previously recycled document to be processed again.
document.resetRecycleStatus();
}
}
/**
 * Identity string plus the wrapped document's location, e.g. "DocumentClosure@1a2b[http://...]".
 * Null-safe: renders "[recycled]" when there is no document (the original dereferenced the
 * field unconditionally and could NPE; message() shows a null document is a reachable state).
 */
@Override
public String toString()
{
  Document document = this.document;
  return super.toString() + "[" + (document == null ? "recycled" : document.getLocation()) + "]";
}
/**
 * Hash on the wrapped document, so a closure hashes consistently with its document.
 * NOTE(review): the hash basis is a mutable field (changeDocument() swaps it); avoid mutating
 * while this closure is a key in a hash-based collection.
 */
@Override
public int hashCode()
{
return (document == null) ? -1 : document.hashCode();
}
@Override
public ITermVector termVector()
{
return (document == null) ? null : document.termVector();
}
/**
 * Called by DownloadMonitor in case a timeout happens.
 *
 * Marks this closure as IOERROR, forwards the error to the parser (if any), then recycles.
 */
@Override
public void handleIoError(Throwable e)
{
setDownloadStatus(DownloadStatus.IOERROR);
if (documentParser != null)
{
documentParser.handleIoError(e);
}
recycle();
}
/**
 * Short human-readable status string for monitoring UIs.
 *
 * @return the document's location, "recycled" when there is no document, or "no location"
 *         when the document has none.
 */
@Override
public String message()
{
  Document document = this.document;
  if (document == null)
    return "recycled";
  ParsedURL location = document.getLocation();
  // Guard against documents without a location: the original called toString() on a
  // possibly-null ParsedURL and could NPE.
  return (location == null) ? "no location" : location.toString();
}
/**
 * Serialize the wrapped document in XML, delegating to the two-argument overload.
 */
public void serialize(OutputStream stream)
{
serialize(stream, StringFormat.XML);
}
/**
 * Serialize the wrapped document to the given stream in the given format.
 *
 * Fixes a defect in the original implementation, which ignored the stream parameter and always
 * wrote to System.out (plus a stray blank line on stdout).
 *
 * @param stream the destination stream; not closed by this method.
 * @param format the serialization format (e.g. XML, JSON).
 */
public void serialize(OutputStream stream, StringFormat format)
{
  Document document = getDocument();
  try
  {
    // Write to the caller-supplied stream (previously hard-coded to System.out).
    SimplTypesScope.serialize(document, stream, format);
  }
  catch (SIMPLTranslationException e)
  {
    error("Could not serialize " + document);
    e.printStackTrace();
  }
}
/**
 * Serialize the wrapped document as XML into the given StringBuilder.
 *
 * @param buffy destination buffer; the XML is appended to it.
 */
public void serialize(StringBuilder buffy)
{
Document document = getDocument();
try
{
SimplTypesScope.serialize(document, buffy, StringFormat.XML);
// Progress newline on the console; buffy itself receives only the XML.
System.out.println("\n");
}
catch (SIMPLTranslationException e)
{
error("Could not serialize " + document);
e.printStackTrace();
}
}
}