package org.carrot2.core;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.core.Commit;
import org.simpleframework.xml.core.Persist;
import com.google.common.base.Function;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
/**
 * A document with a title, a summary, a content URL, a language, a list of sources and
 * any number of additional named fields.
 * <p>
 * The most frequently used fields ({@link #TITLE}, {@link #SUMMARY},
 * {@link #CONTENT_URL}, {@link #LANGUAGE} and {@link #SOURCES}) are kept in dedicated
 * class fields. All other fields are kept in a map that is allocated only when the first
 * such field is set. Instances of this class are not synchronized.
 * </p>
 */
public class Document
{
    /** Field name for the title of the document. */
    public static final String TITLE = "title";

    /**
     * Field name for a short summary of the document, e.g. the snippet returned by the
     * search engine.
     */
    public static final String SUMMARY = "snippet";

    /** Field name for an URL pointing to the full version of the document. */
    public static final String CONTENT_URL = "url";

    /**
     * Click URL. The URL that should be placed in the anchor to the document instead of
     * the value returned in {@link #CONTENT_URL}.
     */
    public static final String CLICK_URL = "click-url";

    /**
     * Field name for an URL pointing to the thumbnail image associated with the document.
     */
    public static final String THUMBNAIL_URL = "thumbnail-url";

    /** Document size. */
    public static final String SIZE = "size";

    /**
     * Field name for a list of sources the document was found in. Value type:
     * {@code List<String>}.
     */
    public static final String SOURCES = "sources";

    /**
     * Field name for the language in which the document is written. Value type:
     * {@link LanguageCode}. If the <code>language</code> field is not defined or is
     * <code>null</code>, it means the language of the document is unknown or it is
     * outside of the list defined in {@link LanguageCode}.
     */
    public static final String LANGUAGE = "language";

    /**
     * Identifiers of reference clustering partitions this document belongs to. Currently,
     * this field is used only to calculate various clustering quality metrics. In the
     * future, clustering algorithms may be able to use values of this field to increase
     * the quality of clustering.
     * <p>
     * Value type: {@code Collection<Object>}. There is no constraint on the actual type
     * of the partition identifier in the collection. Identifiers are assumed to correctly
     * implement the {@link Object#equals(Object)} and {@link Object#hashCode()} methods.
     * </p>
     */
    public static final String PARTITIONS = "partitions";

    /**
     * Value of the {@link #TITLE} field. Empty for documents created with the
     * no-argument constructor; the other constructors store their argument as provided,
     * including <code>null</code>.
     */
    private String titleVal = StringUtils.EMPTY;

    /** Value of the {@link #SUMMARY} field, same defaults as the title. */
    private String summaryVal = StringUtils.EMPTY;

    /** Value of the {@link #CONTENT_URL} field, same defaults as the title. */
    private String urlVal = StringUtils.EMPTY;

    /** Value of the {@link #LANGUAGE} field, <code>null</code> if unknown. */
    private LanguageCode langVal = null;

    /** Value of the {@link #SOURCES} field, <code>null</code> if not set. */
    private List<String> sourcesVal;

    /**
     * Fields that have no dedicated class field, keyed by field name. Remains
     * <code>null</code> until the first such field is set, so documents carrying only
     * the dedicated fields do not pay for a map.
     */
    private Map<String, Object> otherFields;

    /**
     * Internal identifier of the document. This identifier is assigned dynamically after
     * documents are returned from {@link IDocumentSource}.
     *
     * @see ProcessingResult
     */
    @Attribute(required = false)
    Integer id;

    /**
     * Creates an empty document. Title, summary and content URL are empty strings, the
     * language is <code>null</code>.
     */
    public Document()
    {
    }

    /**
     * Creates a document with the provided <code>title</code>.
     */
    public Document(String title)
    {
        this(title, null);
    }

    /**
     * Creates a document with the provided <code>title</code> and <code>summary</code>.
     */
    public Document(String title, String summary)
    {
        // The cast selects the (String, String, String) overload; a bare null would be
        // ambiguous with the (String, String, LanguageCode) one.
        this(title, summary, (String) null);
    }

    /**
     * Creates a document with the provided <code>title</code>, <code>summary</code> and
     * <code>language</code>.
     */
    public Document(String title, String summary, LanguageCode language)
    {
        this(title, summary, null, language);
    }

    /**
     * Creates a document with the provided <code>title</code>, <code>summary</code> and
     * <code>contentUrl</code>.
     */
    public Document(String title, String summary, String contentUrl)
    {
        this(title, summary, contentUrl, null);
    }

    /**
     * Creates a document with the provided <code>title</code>, <code>summary</code>,
     * <code>contentUrl</code> and <code>language</code>. Arguments are stored as
     * provided, so a <code>null</code> argument leaves the corresponding field
     * <code>null</code>.
     */
    public Document(String title, String summary, String contentUrl, LanguageCode language)
    {
        this.titleVal = title;
        this.summaryVal = summary;
        this.urlVal = contentUrl;
        this.langVal = language;
    }

    /**
     * A unique identifier of this document. The identifiers are assigned to documents
     * before processing finishes. Note that two documents with equal contents will be
     * assigned different identifiers.
     *
     * @return unique identifier of this document
     */
    public Integer getId()
    {
        return id;
    }

    /**
     * Returns this document's {@link #TITLE} field.
     */
    @Element(required = false)
    public String getTitle()
    {
        return this.titleVal;
    }

    /**
     * Sets this document's {@link #TITLE} field.
     *
     * @param title title to set
     * @return this document for convenience
     */
    @Element(required = false)
    public Document setTitle(String title)
    {
        this.titleVal = title;
        return this;
    }

    /**
     * Returns this document's {@link #SUMMARY} field.
     */
    @Element(name = "snippet", required = false)
    public String getSummary()
    {
        return this.summaryVal;
    }

    /**
     * Sets this document's {@link #SUMMARY} field.
     *
     * @param summary summary to set
     * @return this document for convenience
     */
    @Element(name = "snippet", required = false)
    public Document setSummary(String summary)
    {
        this.summaryVal = summary;
        return this;
    }

    /**
     * Returns this document's {@link #CONTENT_URL} field.
     */
    @Element(name = "url", required = false)
    public String getContentUrl()
    {
        return this.urlVal;
    }

    /**
     * Sets this document's {@link #CONTENT_URL} field.
     *
     * @param contentUrl content URL to set
     * @return this document for convenience
     */
    @Element(name = "url", required = false)
    public Document setContentUrl(String contentUrl)
    {
        this.urlVal = contentUrl;
        return this;
    }

    /**
     * Returns this document's {@link #SOURCES} field.
     *
     * @return the list passed to {@link #setSources(List)} (not a copy), or
     *         <code>null</code> if no sources have been set
     */
    @ElementList(entry = "source", required = false)
    public List<String> getSources()
    {
        return this.sourcesVal;
    }

    /**
     * Sets this document's {@link #SOURCES} field.
     *
     * @param sources the sources list to set
     * @return this document for convenience
     */
    @ElementList(entry = "source", required = false)
    public Document setSources(List<String> sources)
    {
        this.sourcesVal = sources;
        return this;
    }

    /**
     * Returns this document's {@link #LANGUAGE}.
     */
    public LanguageCode getLanguage()
    {
        return this.langVal;
    }

    /**
     * Sets this document's {@link #LANGUAGE}.
     *
     * @param language the language to set
     * @return this document for convenience
     */
    public Document setLanguage(LanguageCode language)
    {
        this.langVal = language;
        return this;
    }

    /**
     * Returns the ISO code of this document's language for the <code>language</code> XML
     * attribute, or <code>null</code> if the language is not set.
     */
    @SuppressWarnings("unused")
    @Attribute(required = false, name = "language")
    private String getLanguageIsoCode()
    {
        final LanguageCode language = getLanguage();
        return language != null ? language.getIsoCode() : null;
    }

    /**
     * Sets this document's language from the value of the <code>language</code> XML
     * attribute. The value is first resolved with
     * {@link LanguageCode#forISOCode(String)}; if that yields <code>null</code>, the
     * value is interpreted as an enum constant name. A <code>null</code> value clears
     * the language.
     */
    @SuppressWarnings("unused")
    @Attribute(required = false, name = "language")
    private void setLanguageIsoCode(String languageIsoCode)
    {
        if (languageIsoCode == null)
        {
            setLanguage(null);
            return;
        }

        final LanguageCode byIsoCode = LanguageCode.forISOCode(languageIsoCode);
        if (byIsoCode != null)
        {
            setLanguage(byIsoCode);
        }
        else
        {
            // Try by enum name for backward-compatibility. Enum.valueOf throws
            // IllegalArgumentException if the name matches no constant.
            setLanguage(LanguageCode.valueOf(languageIsoCode));
        }
    }

    /**
     * For JSON and XML serialization only.
     *
     * @return an unmodifiable view of the fields that have no dedicated class field, or
     *         <code>null</code> if this document has no such fields
     */
    @SuppressWarnings("unused")
    private Map<String, Object> getOtherFields()
    {
        if (otherFields == null || otherFields.isEmpty())
        {
            return null;
        }
        return Collections.unmodifiableMap(otherFields);
    }

    /**
     * Returns all fields of this document. The returned map is unmodifiable and is a
     * snapshot: later changes to this document are not reflected in it. Fields whose
     * value is <code>null</code> are not included.
     *
     * @return all fields of this document, never <code>null</code>
     */
    public Map<String, Object> getFields()
    {
        final Map<String, Object> all = new LinkedHashMap<String, Object>();
        putIfPresent(all, TITLE, titleVal);
        putIfPresent(all, SUMMARY, summaryVal);
        putIfPresent(all, CONTENT_URL, urlVal);
        putIfPresent(all, LANGUAGE, langVal);
        putIfPresent(all, SOURCES, sourcesVal);
        if (otherFields != null)
        {
            all.putAll(otherFields);
        }
        return Collections.unmodifiableMap(all);
    }

    /**
     * Adds the entry to <code>target</code> unless <code>value</code> is
     * <code>null</code>.
     */
    private static void putIfPresent(Map<String, Object> target, String name, Object value)
    {
        if (value != null)
        {
            target.put(name, value);
        }
    }

    /**
     * Returns value of the specified field of this document. If no field corresponds to
     * the provided <code>name</code>, <code>null</code> will be returned.
     * <p>
     * Field names are matched exactly. The value is cast to the type expected by the
     * caller without a runtime check, so requesting the wrong type results in a
     * {@link ClassCastException} at the call site.
     * </p>
     *
     * @param name of the field to be returned
     * @return value of the field or <code>null</code>
     */
    @SuppressWarnings("unchecked")
    public <T> T getField(String name)
    {
        final Object value;
        if (TITLE.equals(name))
        {
            value = this.titleVal;
        }
        else if (SUMMARY.equals(name))
        {
            value = this.summaryVal;
        }
        else if (CONTENT_URL.equals(name))
        {
            value = this.urlVal;
        }
        else if (LANGUAGE.equals(name))
        {
            value = this.langVal;
        }
        else if (SOURCES.equals(name))
        {
            value = this.sourcesVal;
        }
        else
        {
            value = otherFields != null ? otherFields.get(name) : null;
        }
        return (T) value;
    }

    /**
     * Sets a field in this document. Field names are matched exactly. Values of
     * {@link #TITLE}, {@link #SUMMARY} and {@link #CONTENT_URL} must be strings, the
     * value of {@link #LANGUAGE} must be a {@link LanguageCode} and the value of
     * {@link #SOURCES} must be a {@code List<String>}. Any other field accepts values of
     * any type; setting such a field to <code>null</code> removes it.
     *
     * @param name of the field to set
     * @param value value of the field
     * @return this document for convenience
     * @throws IllegalArgumentException if <code>name</code> is <code>null</code>
     * @throws ClassCastException if the value type does not match the named field
     */
    @SuppressWarnings("unchecked")
    public Document setField(String name, Object value)
    {
        if (name == null)
        {
            throw new IllegalArgumentException("Field name must not be null");
        }

        if (TITLE.equals(name))
        {
            this.titleVal = (String) value;
        }
        else if (SUMMARY.equals(name))
        {
            this.summaryVal = (String) value;
        }
        else if (CONTENT_URL.equals(name))
        {
            this.urlVal = (String) value;
        }
        else if (LANGUAGE.equals(name))
        {
            this.langVal = (LanguageCode) value;
        }
        else if (SOURCES.equals(name))
        {
            // Only the List part can be checked at runtime, element types are erased.
            this.sourcesVal = (List<String>) value;
        }
        else if (value != null)
        {
            if (otherFields == null)
            {
                otherFields = new LinkedHashMap<String, Object>();
            }
            otherFields.put(name, value);
        }
        else if (otherFields != null)
        {
            otherFields.remove(name);
        }
        return this;
    }

    /**
     * Assigns sequential identifiers to the provided <code>documents</code>. An
     * identifier a document already has is kept as long as it is unique within the
     * collection. If an identifier occurs more than once, the first document in
     * iteration order keeps it and the remaining ones are treated as if they had no
     * identifier. New identifiers continue from the largest identifier already present,
     * or start from 0 if there is none or all existing identifiers are negative.
     * <p>
     * The method synchronizes on the <code>documents</code> collection.
     * </p>
     *
     * @param documents documents to assign identifiers to.
     */
    public static void assignDocumentIds(Collection<Document> documents)
    {
        // We may get concurrent calls referring to the same documents
        // in the same list, so we need to synchronize here.
        synchronized (documents)
        {
            final HashSet<Integer> ids = Sets.newHashSet();

            // First, find the start value for the id, check uniqueness of the ids
            // already provided and erase duplicated ids.
            int maxId = Integer.MIN_VALUE;
            for (final Document document : documents)
            {
                if (document.id != null)
                {
                    if (ids.add(document.id))
                    {
                        maxId = Math.max(maxId, document.id);
                    }
                    else
                    {
                        document.id = null;
                    }
                }
            }

            // We'd rather start with 0
            maxId = Math.max(maxId, -1);

            // Assign missing ids
            for (final Document document : documents)
            {
                if (document.id == null)
                {
                    document.id = ++maxId;
                }
            }
        }
    }

    /**
     * Transforms a {@link Document} to its identifier returned by
     * {@link Document#getId()}.
     */
    public static final class DocumentToId implements Function<Document, Integer>
    {
        public static final DocumentToId INSTANCE = new DocumentToId();

        private DocumentToId()
        {
        }

        public Integer apply(Document document)
        {
            return document.id;
        }
    }

    /**
     * Compares {@link Document}s by their identifiers {@link #getId()}, which effectively
     * gives the original order in which they were returned by the document source.
     * Documents without an identifier are ordered first.
     */
    public static final Comparator<Document> BY_ID_COMPARATOR = Ordering.natural()
        .nullsFirst().onResultOf(DocumentToId.INSTANCE);

    /**
     * Hook marked with {@link Persist}, intended to run before this document is
     * serialized to XML. Currently there is nothing to prepare, so the method is empty.
     */
    @Persist
    @SuppressWarnings("unused")
    private void beforeSerialization()
    {
    }

    /**
     * Hook marked with {@link Commit}, intended to run after this document has been
     * deserialized from XML. Currently there is nothing to post-process, so the method
     * is empty.
     */
    @Commit
    @SuppressWarnings("unused")
    private void afterDeserialization() throws Throwable
    {
    }
}