package eu.europeana.cloud.service.dps.similarity;
import eu.europeana.cloud.common.web.ParamConstants;
import eu.europeana.cloud.mcs.driver.FileServiceClient;
import eu.europeana.cloud.service.dps.index.IndexFields;
import eu.europeana.cloud.service.dps.index.Indexer;
import eu.europeana.cloud.service.dps.index.IndexerFactory;
import eu.europeana.cloud.service.dps.index.exception.IndexerException;
import eu.europeana.cloud.service.dps.index.structure.IndexerInformations;
import eu.europeana.cloud.service.dps.index.structure.SearchHit;
import eu.europeana.cloud.service.dps.index.structure.SearchResult;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Service for searching similar and duplicate documents.
 *
 * <p>Queries are delegated to an {@link Indexer} obtained from {@link IndexerFactory}.
 * The set of index fields used for a query is chosen per representation name, which is
 * parsed from the document identifier (a file URI).
 *
 * @author Pavel Kefurt <Pavel.Kefurt@gmail.com>
 */
public class SimilarityService
{
    /**
     * Fraction of the reference document's own score that another hit must reach
     * to be reported as a duplicate (1 = 100%).
     */
    private static final float DUPLICATION_THRESHOLD = (float) 0.98;

    private static final Logger LOGGER = LoggerFactory.getLogger(SimilarityService.class);

    private final Indexer client;

    /**
     * Field selection per representation name. Keys are looked up in upper case.
     * A {@code null} or empty map, or a {@code null} value, means "all possible fields".
     */
    private Map<String, List<String>> fields;

    /**
     * Construct object with default fields.
     * @param indexer name of indexer ({@link eu.europeana.cloud.service.dps.index.SupportedIndexers SupportedIndexers})
     * @param index index name
     * @param type type name ({@link eu.europeana.cloud.service.dps.index.Solr Solr} indexer ignores this parameter)
     * @param addresses indexer servers addresses (separated by semicolon)
     * @throws IndexerException if no indexer can be created for the given informations
     */
    public SimilarityService(String indexer, String index, String type, String addresses) throws IndexerException
    {
        this(new IndexerInformations(indexer, index, type, addresses));
    }

    /**
     * Construct object with custom fields.
     * @param indexer name of indexer ({@link eu.europeana.cloud.service.dps.index.SupportedIndexers SupportedIndexers})
     * @param index index name
     * @param type type name ({@link eu.europeana.cloud.service.dps.index.Solr Solr} indexer ignores this parameter)
     * @param addresses indexer servers addresses (separated by semicolon)
     * @param fields selected fields for every representation. These fields will be used for
     * similarity and duplicate search. Map structure:
     * {@code Map<"REPRESENTATION NAME", List<"field name">>}; keys must be upper case because
     * the representation name is upper-cased before lookup (null means all possible fields)
     * @throws IndexerException if no indexer can be created for the given informations
     */
    public SimilarityService(String indexer, String index, String type, String addresses,
            Map<String, List<String>> fields) throws IndexerException
    {
        this(new IndexerInformations(indexer, index, type, addresses), fields);
    }

    /**
     * Construct object with default fields.
     * @param ii instance of IndexerInformations
     * @throws IndexerException if no indexer can be created for the given informations
     */
    public SimilarityService(IndexerInformations ii) throws IndexerException
    {
        client = createIndexer(ii);
        initFields();
    }

    /**
     * Construct object with custom fields.
     * @param ii instance of IndexerInformations
     * @param fields selected fields for every representation. These fields will be used for
     * similarity and duplicate search. Map structure:
     * {@code Map<"REPRESENTATION NAME", List<"field name">>}; keys must be upper case because
     * the representation name is upper-cased before lookup (null means all possible fields)
     * @throws IndexerException if no indexer can be created for the given informations
     */
    public SimilarityService(IndexerInformations ii, Map<String, List<String>> fields) throws IndexerException
    {
        client = createIndexer(ii);
        this.fields = fields;
    }

    /**
     * Retrieve similar documents for reference document.
     * @param documentId unique identifier of reference document
     * @return instance of SearchResult or null if documentId is not set
     * @throws IndexerException
     */
    public SearchResult getSimilarDocuments(String documentId) throws IndexerException
    {
        if (documentId == null)
        {
            return null;
        }

        return client.getMoreLikeThis(documentId, toFieldArray(documentId));
    }

    /**
     * Retrieve similar documents for reference document.
     * @param documentId unique identifier of reference document
     * @param limit maximum number of results
     * @return instance of SearchResult, or null if documentId is not set or limit is not positive
     * @throws IndexerException
     */
    public SearchResult getSimilarDocuments(String documentId, int limit) throws IndexerException
    {
        if (documentId == null || limit <= 0)
        {
            return null;
        }

        return client.getMoreLikeThis(documentId, toFieldArray(documentId), limit, 0);
    }

    /**
     * Retrieve duplicate documents for reference document.
     *
     * <p>A hit counts as a duplicate when its score is at least
     * {@code DUPLICATION_THRESHOLD} times the score of the reference document itself.
     * The reference document is never part of the returned list.
     *
     * @param documentId unique identifier of reference document
     * @return list of SearchHits (empty if the indexer returned no hits or the reference
     * document is not among them) or null if documentId is not set
     * @throws IndexerException
     */
    public List<SearchHit> calcDuplicateDocuments(String documentId) throws IndexerException
    {
        if (documentId == null)
        {
            return null;
        }

        // NOTE(review): 50 and 2 are presumably "max query terms" and "min term frequency",
        // and the trailing 'true' presumably asks the indexer to include the reference
        // document in the hits - confirm against the Indexer interface. The code below
        // relies on the reference document being present among the hits.
        SearchResult result = client.getMoreLikeThis(documentId, toFieldArray(documentId),
                50, 2, Indexer.MIN_DOC_FREQ, Indexer.MAX_DOC_FREQ,
                Indexer.MIN_WORD_LENGTH, Indexer.MAX_WORD_LENGTH, Indexer.PAGE_SIZE, 0, true);

        List<SearchHit> duplicates = new ArrayList<>();
        if (result == null || result.getHits() == null)
        {
            return duplicates;
        }

        SearchHit reference = null;
        for (SearchHit hit : result.getHits())
        {
            if (documentId.equals(hit.getId()))
            {
                reference = hit;
                break;
            }
        }

        if (reference == null)
        {
            // Without the reference score there is nothing to compare against.
            return duplicates;
        }

        float threshold = reference.getScore() * DUPLICATION_THRESHOLD;
        for (SearchHit hit : result.getHits())
        {
            // Skip the reference hit itself instead of removing it from the indexer's list,
            // so the result object is left untouched and unmodifiable lists are tolerated.
            if (hit != reference && hit.getScore() >= threshold)
            {
                duplicates.add(hit);
            }
        }

        return duplicates;
    }

    /**
     * Creates the indexer client and fails fast when the indexer is not supported.
     */
    private static Indexer createIndexer(IndexerInformations ii) throws IndexerException
    {
        Indexer indexer = IndexerFactory.getIndexer(ii);
        if (indexer == null)
        {
            LOGGER.warn("No indexer.");
            throw new IndexerException("Unsupported indexer.");
        }
        return indexer;
    }

    /**
     * Converts the configured field list for the document into an array
     * (null when all fields should be used).
     */
    private String[] toFieldArray(String documentId)
    {
        List<String> selected = getFields(documentId);
        return selected != null ? selected.toArray(new String[selected.size()]) : null;
    }

    /**
     * Looks up the configured fields for the representation the given file URI points to.
     * @param url file URI of the document
     * @return configured field names, or null (meaning all fields) when nothing is configured
     * or the representation name cannot be determined
     */
    private List<String> getFields(String url)
    {
        if (fields == null || fields.isEmpty() || url == null)
        {
            return null;
        }

        Map<String, String> parsed = FileServiceClient.parseFileUri(url);
        if (parsed == null)
        {
            return null;
        }

        String repName = parsed.get(ParamConstants.P_REPRESENTATIONNAME);
        if (repName == null)
        {
            return null;
        }

        // Locale.ROOT keeps the lookup independent of the JVM default locale
        // (e.g. Turkish upper-casing of 'i' would otherwise break keys such as "OAI").
        return fields.get(repName.toUpperCase(Locale.ROOT));
    }

    /**
     * Fields initializer.
     * Structure: {@code Map<"REPRESENTATION NAME", List<"field name">>} (null means all possible fields).
     *
     * <p>Called from the constructor: an overriding subclass must not rely on its own
     * instance fields here, because they are not initialised yet at that point.
     */
    protected void initFields()
    {
        List<String> pdf = new ArrayList<>();
        pdf.add(IndexFields.RAW_TEXT.toString());
        pdf.add("another_field");

        String[] oai =
        {
            "description",
            "title"
        };

        Map<String, List<String>> defaults = new HashMap<>();
        defaults.put("PDF", pdf);
        defaults.put("TXT", null);
        defaults.put("OAI", Arrays.asList(oai));

        // Publish the defaults; previously the map was built and then discarded.
        this.fields = defaults;
    }
}