/**
*
*/
package ecologylab.bigsemantics.documentparsers;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.List;
import ecologylab.bigsemantics.actions.SemanticsConstants;
import ecologylab.bigsemantics.collecting.SemanticsGlobalScope;
import ecologylab.bigsemantics.downloadcontrollers.DownloadController;
import ecologylab.bigsemantics.logging.DocumentLogRecord;
import ecologylab.bigsemantics.metadata.Metadata;
import ecologylab.bigsemantics.metadata.builtins.Document;
import ecologylab.bigsemantics.metadata.builtins.DocumentClosure;
import ecologylab.bigsemantics.metametadata.MetaMetadata;
import ecologylab.bigsemantics.metametadata.MetaMetadataCompositeField;
import ecologylab.bigsemantics.metametadata.MetaMetadataRepository;
import ecologylab.bigsemantics.seeding.Seed;
import ecologylab.collections.Scope;
import ecologylab.generic.Debug;
import ecologylab.generic.ReflectionTools;
import ecologylab.net.ParsedURL;
/**
 * Super class for all document parser types. This class obtains the connection to a document. A
 * parse method may be present to process the document.
 * <p/>
 * Their role is to translate a Document into some kind of metadata and semantic actions. They start
 * with a PURL, which the static connect() method translates into a PURLConnection, and an
 * appropriate instance of a subclass. The {@link #parse() parse()} method is then called to
 * translate the document into the semantic model, using the PURLConnection.
 * <p/>
 * The translation from a PURL to an appropriate subclass instance is performed using a combination
 * of the PURL's extension, if it has a useful one, and then the mime-type returned by the
 * URLConnection response header. These keys are then used to perform a lookup in one of the
 * registries maintained in this class.
 * <p/>
 * Within this class itself, parsers are looked up by binding name; see
 * {@link #register(String, Class)} and
 * {@link #getByBinding(String, SemanticsGlobalScope, DocumentClosure, DownloadController)}.
 *
 * @author andruid
 * @author eunyee
 */
@SuppressWarnings(
{ "rawtypes", "unchecked" })
abstract public class DocumentParser<D extends Document> extends Debug
{

  /** Parser classes keyed by binding name; filled through {@link #register(String, Class)}. */
  static final Scope<Class<? extends DocumentParser>> bindingParserMap    = new Scope<Class<? extends DocumentParser>>();

  // NOTE(review): the three registries below are not referenced anywhere inside this class;
  // they are kept because they are package-visible and may be used by sibling classes.
  static final Scope<Class<? extends DocumentParser>> registryByMimeType  = new Scope<Class<? extends DocumentParser>>();

  static final Scope<Class<? extends DocumentParser>> registryBySuffix    = new Scope<Class<? extends DocumentParser>>();

  static final Scope<Class<? extends DocumentParser>> registryByClassName = new Scope<Class<? extends DocumentParser>>();

  /** Suffixes that have already been reported through {@link #isRegisteredNoParser(ParsedURL)}. */
  static final HashSet<String>                        NO_PARSER_SUFFIX_MAP = new HashSet<String>();

  static
  {
    // Built-in bindings; clients may override any of them by calling register() again.
    register(SemanticsConstants.DIRECT_BINDING_PARSER, DirectBindingParser.class);
    register(SemanticsConstants.XPATH_PARSER, XPathParser.class);
    register(SemanticsConstants.FEED_PARSER, FeedParser.class);
    register(SemanticsConstants.HTML_IMAGE_DOM_TEXT_PARSER,
             HTMLDOMImageTextParser.class);
    register(SemanticsConstants.PDF_PARSER, PdfParser.class);
  }

  /**
   * Allow clients to register or re-register parsers by name.
   *
   * This can be useful when a client needs to provide an alternative implementation for a parser.
   * For example, the semantics service may need to use a different algorithm for HTML image text
   * clipping derivation in another parser.
   *
   * @param binding
   *          The binding name under which the parser class is stored; an existing entry with the
   *          same name is replaced.
   * @param parserClass
   *          The parser class to instantiate for that binding.
   */
  public static void register(String binding, Class<? extends DocumentParser> parserClass)
  {
    bindingParserMap.put(binding, parserClass);
  }

  /**
   * Get registered parser by binding.
   *
   * @param binding
   *          The binding name to look up; may be null.
   * @param semanticsScope
   *          The scope handed to the new parser.
   * @param documentClosure
   *          The closure handed to the new parser.
   * @param downloadController
   *          The download controller handed to the new parser.
   * @return A freshly created and initialized parser, or null if the binding is null, no parser
   *         class is registered for it, or the parser could not be instantiated.
   */
  public static DocumentParser getByBinding(String binding,
                                            SemanticsGlobalScope semanticsScope,
                                            DocumentClosure documentClosure,
                                            DownloadController downloadController)
  {
    if (binding == null)
    {
      return null;
    }

    Class<? extends DocumentParser> parserClass =
        (Class<? extends DocumentParser>) bindingParserMap.get(binding);
    if (parserClass == null)
    {
      return null;
    }

    DocumentParser result = ReflectionTools.getInstance(parserClass);
    // Guard against a failed instantiation instead of throwing a NullPointerException below.
    // NOTE(review): assumes ReflectionTools.getInstance() signals failure by returning null.
    if (result != null)
    {
      result.setSemanticsScope(semanticsScope);
      result.setDocumentClosure(documentClosure);
      result.setDownloadController(downloadController);
    }
    return result;
  }

  /**
   * Get registered parser by meta-metadata.
   *
   * @param mmd
   *          The meta-metadata whose parser binding is used for the lookup; may be null.
   * @param semanticsScope
   *          The scope handed to the new parser.
   * @param documentClosure
   *          The closure handed to the new parser.
   * @param downloadController
   *          The download controller handed to the new parser.
   * @return The parser for the binding declared by the meta-metadata, or null if the meta-metadata
   *         is null, declares no parser, or no parser could be created for its binding.
   */
  public static DocumentParser getByMmd(MetaMetadata mmd,
                                        SemanticsGlobalScope semanticsScope,
                                        DocumentClosure documentClosure,
                                        DownloadController downloadController)
  {
    if (mmd == null)
    {
      return null;
    }
    // getByBinding() already maps a null binding to a null result.
    return DocumentParser.getByBinding(mmd.getParser(),
                                       semanticsScope,
                                       documentClosure,
                                       downloadController);
  }

  /**
   * Record the suffix of the input PURL as not associated with any parser.
   * <p/>
   * Note that this is a test-and-set operation: the first call for a given suffix records it and
   * returns false; later calls for the same suffix return true.
   *
   * @param purl
   *          The location whose suffix is checked and recorded; may be null.
   * @return true if the suffix of the purl was already recorded. false if it was recorded by this
   *         call, or if the purl is null or has no suffix.
   */
  public static boolean isRegisteredNoParser(ParsedURL purl)
  {
    if (purl == null)
    {
      return false;
    }
    String suffix = purl.suffix();
    if (suffix == null || suffix.length() == 0)
    {
      return false;
    }
    // Set.add() returns false exactly when the element was already present.
    return !NO_PARSER_SUFFIX_MAP.add(suffix);
  }

  private SemanticsGlobalScope semanticsScope;

  private DownloadController   downloadController;

  private DocumentClosure      documentClosure;

  /**
   * Default constructor.
   */
  protected DocumentParser()
  {
    super();
  }

  /**
   * @return The semantics scope of this parser; null if none was set or after {@link #recycle()}.
   */
  public SemanticsGlobalScope getSemanticsScope()
  {
    return semanticsScope;
  }

  protected void setSemanticsScope(SemanticsGlobalScope semanticsScope)
  {
    this.semanticsScope = semanticsScope;
  }

  /**
   * @return The document closure of this parser; null if none was set or after {@link #recycle()}.
   */
  public DocumentClosure getDocumentClosure()
  {
    return documentClosure;
  }

  protected void setDocumentClosure(DocumentClosure documentClosure)
  {
    this.documentClosure = documentClosure;
  }

  /**
   * @return The download controller of this parser; null if none was set.
   */
  public DownloadController getDownloadController()
  {
    return downloadController;
  }

  protected void setDownloadController(DownloadController downloadController)
  {
    this.downloadController = downloadController;
  }

  /**
   * @return The document held by the document closure, or null if there is no document closure
   *         (e.g. after {@link #recycle()}).
   */
  public D getDocument()
  {
    return documentClosure == null ? null : (D) documentClosure.getDocument();
  }

  /**
   * Subclasses can implement this for looking up downloaded document.
   *
   * @param metadata
   *          The metadata to look up a downloaded document for.
   * @return null always for the default base class.
   */
  protected Document lookupDownloadedDocument(Metadata metadata)
  {
    return null;
  }

  /**
   * Subclasses can implement this for looking up true meta-metadata and returning the right
   * metadata object.
   *
   * @param repository
   *          The meta-metadata repository to consult.
   * @param thisMetadata
   *          The metadata object to resolve.
   * @return null always for the default base class.
   */
  protected Metadata lookupTrueMetaMetadata(MetaMetadataRepository repository, Metadata thisMetadata)
  {
    return null;
  }

  /**
   * @return Meta-metadata of the corresponding document, or null if there is no document closure
   *         or no document.
   */
  public MetaMetadataCompositeField getMetaMetadata()
  {
    if (documentClosure == null)
    {
      return null;
    }
    Document document = documentClosure.getDocument();
    return document == null ? null : document.getMetaMetadata();
  }

  /**
   * Subclasses can override this to expose the seed they are associated with.
   *
   * @return null always for the default base class.
   */
  public Seed getSeed()
  {
    return null;
  }

  /**
   * @return A LogRecord that can be used to log events through the lifecycle of a Document, or null
   *         if there is no document closure (e.g. after {@link #recycle()}).
   */
  public DocumentLogRecord getLogRecord()
  {
    return documentClosure == null ? null : documentClosure.getLogRecord();
  }

  /**
   * Parse the document.
   *
   * @throws IOException
   */
  public abstract void parse() throws IOException;

  /**
   * Chooses between the document's own location and the location reported by the download
   * controller (its original location or, failing that, the first of the response's other PURLs).
   * <p/>
   * The connection location is returned when the document has no location, when the document's
   * location is a file, when the connection location is a non-file location, or when this parser
   * is not running in a service scope. Otherwise the document's location is returned.
   *
   * @return The ParsedURL value of the current document, or null if it cannot be determined. If you
   *         want to get a URL value, you can use url() method in ParsedURL.
   */
  public ParsedURL purl()
  {
    if (documentClosure == null)
    {
      return null;
    }
    Document document = documentClosure.getDocument();
    if (document == null)
    {
      return null;
    }

    ParsedURL docPurl = document.getLocation();
    if (downloadController == null)
    {
      return docPurl;
    }

    ParsedURL connPurl = downloadController.getOriginalLocation();
    // Only consult the response when there is one; it may not exist before the download happened.
    if (connPurl == null && downloadController.getHttpResponse() != null)
    {
      List<ParsedURL> redirects = downloadController.getHttpResponse().getOtherPurls();
      if (redirects != null && !redirects.isEmpty())
      {
        connPurl = redirects.get(0);
      }
    }

    // A missing scope is treated like a non-service scope, so that this method (and toString(),
    // which relies on it) does not fail with a NullPointerException.
    if (docPurl == null || docPurl.isFile()
        || (connPurl != null && !connPurl.isFile())
        || semanticsScope == null || !semanticsScope.isService())
    {
      return connPurl;
    }
    return docPurl;
  }

  /**
   * The document's PURL, without considering connection PURLs.
   *
   * @return The location of the document, or null if there is no document closure or no document.
   */
  public ParsedURL getTruePURL()
  {
    if (documentClosure == null)
    {
      return null;
    }
    Document document = documentClosure.getDocument();
    return document == null ? null : document.getLocation();
  }

  /**
   * Use the DocumentClosure to re-open a connection, and make the resulting download controller
   * the one used by this parser from now on.
   *
   * @return InputStream for the new connection.
   * @throws IOException
   */
  public InputStream reConnect() throws IOException
  {
    // Distinct local name: the original local shadowed the downloadController field.
    DownloadController reconnected = documentClosure.reConnect();
    this.downloadController = reconnected;
    return reconnected.getHttpResponse().getContentAsStream();
  }

  /**
   * @return The input stream to the raw document being downloaded, if any; null when there is no
   *         download controller or it has no response.
   * @throws UnsupportedEncodingException
   */
  protected InputStream inputStream() throws UnsupportedEncodingException
  {
    if (downloadController == null || downloadController.getHttpResponse() == null)
    {
      return null;
    }
    return downloadController.getHttpResponse().getContentAsStream();
  }

  /**
   * @return The reader of the raw document. null always for the default base class.
   */
  protected Reader reader()
  {
    return null;
  }

  /**
   * Handle I/O error. The default implementation simply calls {@link #recycle()}.
   *
   * @param e
   *          The error that occurred; not inspected by the default implementation.
   */
  public void handleIoError(Throwable e)
  {
    recycle();
  }

  /**
   * Free resources by dropping the references to the semantics scope and the document closure.
   * The download controller reference is left untouched.
   */
  public void recycle()
  {
    semanticsScope = null;
    documentClosure = null;
  }

  /**
   * True if our analysis indicates the present AbstractContainer is an article, and not a
   * collection of links. This affects calls to getWeight() in the model!
   *
   * @return true for an article. false for a collection of links (like a homepage). The default
   *         implementation, here returns true.
   */
  public boolean isAnArticle()
  {
    return true;
  }

  /**
   * @return false always for the default base class.
   */
  public boolean isIndexPage()
  {
    return false;
  }

  /**
   * @return false always for the default base class.
   */
  public boolean isContentPage()
  {
    return false;
  }

  /**
   * Differentiates referential documents, like HTML from composite documents, like PDF.
   *
   * @return true if images referred to by this document are stored within the document itself. The
   *         default implementation, here returns false.
   */
  public boolean isCompositeDocument()
  {
    return false;
  }

  /**
   * @return true to avoid building a DOM at the beginning. The default implementation, here returns
   *         false.
   */
  public boolean doesDirectBinding()
  {
    return false;
  }

  /**
   * @return The inherited string form followed by the {@link #purl()} of this parser in square
   *         brackets, or by "[no purl]" when no PURL is available.
   */
  @Override
  public String toString()
  {
    ParsedURL purl = purl();
    String purlString = (purl != null) ? purl.toString() : null;
    if (purlString == null)
    {
      purlString = "no purl";
    }
    return super.toString() + "[" + purlString + "]";
  }
}