/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.preprocessing;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.DefaultStemmerFactory;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.junit.Before;
import org.junit.Test;
/**
 * Test cases for {@link DocumentAssigner}.
 * <p>
 * Every test builds a small set of documents, optionally tunes the assigner's
 * public attributes, and then delegates to {@link #check(int[][], int)}, which
 * runs preprocessing and the assigner and compares the resulting
 * <code>allLabels</code> data against the expected values.
 */
public class DocumentAssignerTest extends LabelFilterTestBase
{
    /** The document assigner under test; a fresh instance is created before each test. */
    private DocumentAssigner documentAssigner;

    /**
     * Creates a new {@link DocumentAssigner} so that attribute changes made by one
     * test method do not carry over to another.
     */
    @Before
    public void setUpDocumentAssigner()
    {
        documentAssigner = new DocumentAssigner();
    }

    /**
     * Enables the stop word label filter and the complete label filter on the
     * provided filter processor.
     */
    @Override
    protected void initializeFilters(LabelFilterProcessor filterProcessor)
    {
        filterProcessor.completeLabelFilter.enabled = true;
        filterProcessor.stopWordLabelFilter.enabled = true;
    }

    /**
     * With no documents created, no document index sets are expected and the first
     * phrase index is expected to be -1.
     */
    @Test
    public void testEmpty()
    {
        final int [][] expected = {};

        check(expected, -1);
    }

    /**
     * Single-word input with a minimum cluster size of 1: two labels are expected,
     * one assigned to document 0 and one to document 1.
     */
    @Test
    public void testSingleWordLabels()
    {
        documentAssigner.minClusterSize = 1;
        createDocuments("coal is", "coal is", "mining", "mining");

        final int [][] expected = {
            { 0 },
            { 1 }
        };

        check(expected, -1);
    }

    /**
     * Inputs "cat" and "cats" are expected to yield one label that is assigned to
     * documents 0 through 3.
     */
    @Test
    public void testStemmedSingleWordLabelConflation()
    {
        documentAssigner.minClusterSize = 1;
        createDocuments("cat", "cat", "cat", "cat", "cats", "cats", "cats", "cats");

        final int [][] expected = {
            { 0, 1, 2, 3 }
        };

        check(expected, -1);
    }

    /**
     * Inflected variants of the phrase "cat horse" are expected to yield three
     * labels, each assigned to documents 0 through 3, with the first phrase label
     * at index 2.
     */
    @Test
    public void testStemmedPhraseLabelConflation()
    {
        documentAssigner.minClusterSize = 1;
        createDocuments("cat horse", "cat horse", "cats horse", "cats horse",
            "cat horses", "cat horses", "cats horses", "cats horses");

        final int [][] expected = {
            { 0, 1, 2, 3 },
            { 0, 1, 2, 3 },
            { 0, 1, 2, 3 }
        };

        check(expected, 2);
    }

    /**
     * With a minimum cluster size of 2, four labels are expected, each assigned to
     * documents 0 and 1, with the first phrase label at index 2.
     */
    @Test
    public void testMinClusterSize()
    {
        documentAssigner.minClusterSize = 2;
        createDocuments("test coal", "test coal", "coal test . mining",
            "coal test . mining");

        final int [][] expected = {
            { 0, 1 },
            { 0, 1 },
            { 0, 1 },
            { 0, 1 }
        };

        check(expected, 2);
    }

    /**
     * With exact phrase assignment switched on, a single label assigned to
     * documents 0 and 1 is expected.
     */
    @Test
    public void testPhraseLabelsExactMatch()
    {
        documentAssigner.minClusterSize = 2;
        documentAssigner.exactPhraseAssignment = true;
        createDocuments("data is cool", "data is cool", "data is cool", "data is cool",
            "data cool", "data cool");

        final int [][] expected = {
            { 0, 1 }
        };

        check(expected, 0);
    }

    /**
     * Same input as {@link #testPhraseLabelsExactMatch()}, but with exact phrase
     * assignment switched off: two labels are expected, each assigned to documents
     * 0, 1 and 2.
     */
    @Test
    public void testPhraseLabelsNonExactMatch()
    {
        documentAssigner.minClusterSize = 2;
        documentAssigner.exactPhraseAssignment = false;
        createDocuments("data is cool", "data is cool", "data is cool", "data is cool",
            "data cool", "data cool");

        final int [][] expected = {
            { 0, 1, 2 },
            { 0, 1, 2 }
        };

        check(expected, 0);
    }

    /**
     * Runs the assigner with its attributes left untouched on input where the same
     * words occur both as a contiguous sequence and separated by periods. Five
     * labels are expected, with the first phrase label at index 4.
     */
    @Test
    public void testPhraseLabelsNonExactMatchOtherLabels()
    {
        createDocuments("aa bb cc dd", "aa bb cc dd", "dd . cc . bb . aa",
            "dd . cc . bb . aa", "cc . bb . aa", "aa . bb . cc");

        final int [][] expected = {
            { 0, 1, 2 },
            { 0, 1, 2 },
            { 0, 1, 2 },
            { 0, 1 },
            { 0, 1 }
        };

        check(expected, 4);
    }

    /**
     * Runs preprocessing followed by the document assigner, then verifies the
     * first phrase index, the number of document index sets and the contents of
     * each set stored in <code>context.allLabels</code>.
     *
     * @param expectedDocumentIndices expected document indices, one array per label
     * @param expectedFirstPhraseIndex expected value of
     *        <code>allLabels.firstPhraseIndex</code>
     */
    private void check(int [][] expectedDocumentIndices, int expectedFirstPhraseIndex)
    {
        runPreprocessing();
        documentAssigner.assign(context);

        assertThat(context.allLabels.firstPhraseIndex)
            .as("allLabels.firstPhraseIndex")
            .isEqualTo(expectedFirstPhraseIndex);

        final int labelCount = expectedDocumentIndices.length;
        assertThat(context.allLabels.documentIndices)
            .as("allLabels.documentIndices")
            .hasSize(labelCount);

        for (int label = 0; label < labelCount; label++)
        {
            // Convert the label's document set to a plain array for comparison.
            final int [] actual = context.allLabels.documentIndices[label]
                .asIntLookupContainer().toArray();

            assertThat(actual)
                .as("allLabels.documentIndices[" + label + "]")
                .isEqualTo(expectedDocumentIndices[label]);
        }
    }

    /**
     * Supplies a {@link DefaultLexicalDataFactory} as the lexical data factory.
     */
    @Override
    protected ILexicalDataFactory createLexicalDataFactory()
    {
        return new DefaultLexicalDataFactory();
    }

    /**
     * Supplies a {@link DefaultStemmerFactory} as the stemmer factory.
     */
    @Override
    protected IStemmerFactory createStemmerFactory()
    {
        return new DefaultStemmerFactory();
    }
}