/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.vsm;
import org.carrot2.matrix.MatrixAssertions;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.junit.Test;
import com.carrotsearch.hppc.IntIntHashMap;
/**
 * Test cases for {@link TermDocumentMatrixBuilder}.
 * <p>
 * Every test declares the stem indices expected to back consecutive matrix rows and
 * the expected matrix elements, and then compares both against the
 * <code>termDocumentMatrix</code> and <code>stemToRowIndex</code> found in
 * <code>vsmContext</code>.
 * <p>
 * NOTE(review): the arguments of <code>createDocuments</code> look like alternating
 * title / content values for consecutive documents -- confirm against the base class.
 */
public class TermDocumentMatrixBuilderTest extends TermDocumentMatrixBuilderTestBase
{
    /**
     * No documents are created; a matrix with zero rows is expected.
     */
    @Test
    public void testEmpty()
    {
        final int [] expectedTdMatrixStemIndices = {};
        final double [][] expectedTdMatrixElements = {};

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Three documents built from dot-separated single words.
     */
    @Test
    public void testSingleWords()
    {
        createDocuments("", "aa . bb", "", "bb . cc", "", "aa . cc . cc");

        final int [] expectedTdMatrixStemIndices = { 2, 0, 1 };
        final double [][] expectedTdMatrixElements =
        {
            { 0, 1, 2 },
            { 1, 0, 1 },
            { 1, 1, 0 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Three documents, each consisting of the same three-word sequence.
     */
    @Test
    public void testSinglePhrase()
    {
        createDocuments("", "aa bb cc", "", "aa bb cc", "", "aa bb cc");

        final int [] expectedTdMatrixStemIndices = { 0, 1, 2 };
        final double [][] expectedTdMatrixElements =
        {
            { 1, 1, 1 },
            { 1, 1, 1 },
            { 1, 1, 1 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Three documents sharing one three-word sequence plus a fourth document made of
     * repeated single words; the document assigner's <code>minClusterSize</code> is
     * lowered to 1 for this test.
     */
    @Test
    public void testSinglePhraseWithSingleWords()
    {
        createDocuments("", "aa bb cc", "", "aa bb cc", "", "aa bb cc", "",
            "ff . gg . ff . gg");
        preprocessingPipeline.documentAssigner.minClusterSize = 1;

        final int [] expectedTdMatrixStemIndices = { 0, 1, 2, 3, 4 };
        final double [][] expectedTdMatrixElements =
        {
            { 1, 1, 1, 0 },
            { 1, 1, 1, 0 },
            { 1, 1, 1, 0 },
            { 0, 0, 0, 2 },
            { 0, 0, 0, 2 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Three identical documents whose middle word is "stop"; only two matrix rows
     * are expected.
     * <p>
     * NOTE(review): presumably "stop" is configured as a stop word by the test
     * fixture and therefore gets no row -- confirm against the base class.
     */
    @Test
    public void testSinglePhraseWithStopWord()
    {
        createDocuments("", "aa stop cc", "", "aa stop cc", "", "aa stop cc");

        final int [] expectedTdMatrixStemIndices = { 0, 1 };
        final double [][] expectedTdMatrixElements =
        {
            { 1, 1, 1 },
            { 1, 1, 1 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Sets <code>maximumMatrixSize</code> to <code>3 * 2</code> for three documents
     * and expects only two rows to be present in the resulting matrix.
     */
    @Test
    public void testMatrixSizeLimit()
    {
        createDocuments("", "aa . aa", "", "bb . bb . bb", "", "cc . cc . cc . cc");
        preprocessingPipeline.documentAssigner.minClusterSize = 1;
        matrixBuilder.maximumMatrixSize = 3 * 2;

        final int [] expectedTdMatrixStemIndices = { 2, 1 };
        final double [][] expectedTdMatrixElements =
        {
            { 0, 0, 4 },
            { 0, 3, 0 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Same shape of input as {@link #testSingleWords()}, except that the first
     * document is created with "aa" as its first value and "bb" as its second.
     * <p>
     * NOTE(review): presumably the first value of each pair is the document title
     * and words found in titles receive boosted weights -- confirm against the
     * builder.
     */
    @Test
    public void testTitleWordBoost()
    {
        createDocuments("aa", "bb", "", "bb . cc", "", "aa . cc . cc");

        final int [] expectedTdMatrixStemIndices = { 0, 2, 1 };
        final double [][] expectedTdMatrixElements =
        {
            { 2, 0, 2 },
            { 0, 1, 2 },
            { 1, 1, 0 }
        };

        check(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Regression test for CARROT-905. Uses the same documents and expects the same
     * matrix as {@link #testSingleWords()}, but builds the matrices from a
     * preprocessing context whose <code>tfByDocument</code> pairs were put in
     * reverse order first.
     */
    @Test
    public void testCarrot905()
    {
        createDocuments("", "aa . bb", "", "bb . cc", "", "aa . cc . cc");

        // Named differently from the inherited "context" field, which is read below.
        final PreprocessingContext perturbedContext = preprocessingPipeline.preprocess(
            this.context.documents,
            this.context.query,
            this.context.language.getLanguageCode());

        // The preprocessing pipeline will produce increasing indices in tfByDocument,
        // so to reproduce the bug, we need to perturb them, e.g. reverse.
        for (final int [] stemTfByDocument : perturbedContext.allStems.tfByDocument)
        {
            reversePairs(stemTfByDocument);
        }

        vsmContext = new VectorSpaceModelContext(perturbedContext);
        matrixBuilder.buildTermDocumentMatrix(vsmContext);
        matrixBuilder.buildTermPhraseMatrix(vsmContext);

        final int [] expectedTdMatrixStemIndices = { 2, 0, 1 };
        final double [][] expectedTdMatrixElements =
        {
            { 0, 1, 2 },
            { 1, 0, 1 },
            { 1, 1, 0 }
        };

        checkOnly(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Reverses, in place, the order of consecutive two-element pairs stored in the
     * provided array. Values inside each pair keep their relative order. If the
     * array has an odd length, its last element is left untouched.
     *
     * @param pairs array to be modified
     */
    private static void reversePairs(int [] pairs)
    {
        final int pairCount = pairs.length / 2;
        for (int left = 0, right = pairCount - 1; left < right; left++, right--)
        {
            final int leftOffset = left * 2;
            final int rightOffset = right * 2;

            // First element of the pair.
            final int first = pairs[leftOffset];
            pairs[leftOffset] = pairs[rightOffset];
            pairs[rightOffset] = first;

            // Second element of the pair.
            final int second = pairs[leftOffset + 1];
            pairs[leftOffset + 1] = pairs[rightOffset + 1];
            pairs[rightOffset + 1] = second;
        }
    }

    /**
     * Builds the term-document matrix by calling <code>buildTermDocumentMatrix()</code>
     * and then verifies it using {@link #checkOnly(double[][], int[])}.
     *
     * @param expectedTdMatrixElements expected matrix elements
     * @param expectedTdMatrixStemIndices expected stem index for each matrix row, in
     *            row order
     */
    private void check(double [][] expectedTdMatrixElements,
        int [] expectedTdMatrixStemIndices)
    {
        buildTermDocumentMatrix();
        checkOnly(expectedTdMatrixElements, expectedTdMatrixStemIndices);
    }

    /**
     * Verifies the current contents of <code>vsmContext</code> without building
     * anything: the number of matrix rows, the matrix elements and the stem-to-row
     * index mapping.
     *
     * @param expectedTdMatrixElements expected matrix elements
     * @param expectedTdMatrixStemIndices expected stem index for each matrix row, in
     *            row order; element <code>i</code> is expected to map to row
     *            <code>i</code>
     */
    void checkOnly(double [][] expectedTdMatrixElements,
        int [] expectedTdMatrixStemIndices)
    {
        final int expectedRowCount = expectedTdMatrixStemIndices.length;

        assertThat(vsmContext.termDocumentMatrix.rows()).as("tdMatrix.rowCount")
            .isEqualTo(expectedRowCount);
        MatrixAssertions.assertThat(vsmContext.termDocumentMatrix).isEquivalentTo(
            expectedTdMatrixElements);

        final IntIntHashMap expectedStemToRowIndex = new IntIntHashMap();
        for (int row = 0; row < expectedRowCount; row++)
        {
            expectedStemToRowIndex.put(expectedTdMatrixStemIndices[row], row);
        }

        // The cast selects the generic Object assertion.
        assertThat((Object) vsmContext.stemToRowIndex).isEqualTo(expectedStemToRowIndex);
    }
}