/* Copyright 2012 Tim Garrett, Mothsoft LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mothsoft.alexis.engine.textual;
import org.apache.commons.math3.linear.OpenMapRealVector;
import org.apache.commons.math3.linear.RealVector;
import org.apache.commons.math3.linear.SparseRealVector;
import com.mothsoft.alexis.domain.Document;
import com.mothsoft.alexis.domain.DocumentAssociation;
import com.mothsoft.alexis.domain.DocumentNamedEntity;
import com.mothsoft.alexis.domain.DocumentTerm;
/**
 * Domain object that encapsulates the features of a document as three sparse
 * count vectors (terms, associations and named entities) and allows finding
 * its similarity with other documents through the cosine similarity of those
 * vectors.
 */
public class DocumentFeatures {

    /** Term counts, indexed by the context ID of each term. */
    private final SparseRealVector termVector;

    /** Association occurrence counts, indexed by the context ID of each association. */
    private final SparseRealVector associationVector;

    /** Named-entity occurrence counts, indexed by the context ID of each entity. */
    private final SparseRealVector nameVector;

    /**
     * Builds the three feature vectors for the given document.
     *
     * @param document
     *            the document whose associations, terms and named entities
     *            are counted
     * @param context
     *            supplies the vector index (context ID) for each feature
     */
    public DocumentFeatures(final Document document, final DocumentFeatureContext context) {
        // The vectors are sized to Integer.MAX_VALUE, presumably so that any
        // context ID can be used directly as an index.
        // NOTE(review): confirm that getNorm()/dotProduct() in the
        // commons-math version in use only visit stored entries; a dense walk
        // over a dimension this large would be very slow.
        this.termVector = new OpenMapRealVector(Integer.MAX_VALUE);
        this.associationVector = new OpenMapRealVector(Integer.MAX_VALUE);
        this.nameVector = new OpenMapRealVector(Integer.MAX_VALUE);

        // Each association occurrence contributes 1 to its slot.
        for (final DocumentAssociation association : document.getDocumentAssociations()) {
            final Integer id = context.getContextId(association);
            increment(associationVector, id, 1);
        }

        // Terms carry their own count.
        for (final DocumentTerm documentTerm : document.getDocumentTerms()) {
            final Integer termId = context.getContextId(documentTerm.getTerm());
            increment(termVector, termId, documentTerm.getCount());
        }

        // Each named-entity occurrence contributes 1 to its slot.
        for (final DocumentNamedEntity entity : document.getNamedEntities()) {
            final Integer id = context.getContextId(entity);
            increment(nameVector, id, 1);
        }
    }

    /**
     * Adds {@code amount} to the entry of {@code vector} at index {@code id}.
     * The {@code Integer} id is auto-unboxed, so a {@code null} id results in
     * a {@link NullPointerException}.
     */
    private static void increment(final RealVector vector, final Integer id, final int amount) {
        vector.addToEntry(id, amount);
    }

    /**
     * Computes the overall similarity between this document and another as
     * the unweighted mean of the cosine similarities of the term,
     * association and named-entity vectors.
     *
     * @param other
     *            the document features to compare against
     * @return the mean of the three per-vector cosine similarities
     */
    public double cosineSimilarity(final DocumentFeatures other) {
        final double cosineTerms = cosine(this.termVector, other.termVector);
        final double cosineAssociations = cosine(this.associationVector, other.associationVector);
        final double cosineNames = cosine(this.nameVector, other.nameVector);

        // FIXME - more mathematical approach than equal parts?
        return (cosineTerms + cosineAssociations + cosineNames) / 3.0;
    }

    /**
     * Computes the cosine of the angle between two vectors.
     *
     * @param v1
     *            the first vector
     * @param v2
     *            the second vector
     * @return the dot product divided by the product of the norms, clamped to
     *         [-1, 1]; {@code 0} when the product of the norms is zero (for
     *         example when either vector has no non-zero entries)
     */
    public double cosine(final RealVector v1, final RealVector v2) {
        final double norm = v1.getNorm();
        final double norm2 = v2.getNorm();
        final double divisor = norm * norm2;

        // Avoid dividing by zero; a zero-length vector is treated as having
        // no similarity to anything.
        if (divisor == 0.0) {
            return 0.0;
        }

        final double raw = v1.dotProduct(v2) / divisor;

        // Floating-point rounding can push the ratio marginally outside the
        // mathematically valid range (e.g. 1.0000000000000002 for identical
        // vectors), so clamp it back to [-1, 1].
        return Math.max(-1.0, Math.min(1.0, raw));
    }
}