/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.util.List;
import java.util.Map;

import org.carrot2.core.Document;
import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.DefaultLexicalDataFactory;
import org.carrot2.text.linguistic.DefaultStemmerFactory;
import org.carrot2.text.linguistic.DefaultTokenizerFactory;
import org.carrot2.text.linguistic.ILexicalDataFactory;
import org.carrot2.text.linguistic.IStemmerFactory;
import org.carrot2.text.linguistic.ITokenizerFactory;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.util.tests.CarrotTestCase;
import org.junit.Before;

import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;

/**
 * Base class for tests of {@link PreprocessingContext} tasks.
 */
public class PreprocessingComponentTestBase extends CarrotTestCase
{
    /** Preprocessing context for the component being tested. */
    protected PreprocessingContext context;

    /** Documents each test sets up. */
    private List<Document> documents;

    /** Word image to index mapping. */
    protected Map<String, Integer> wordIndices;

    @Before
    public void setUpPreprocessingInfrastructure()
    {
        documents = Lists.newArrayList();
        createPreprocessingContext(null);
    }

    /**
     * Creates the {@link PreprocessingContext} for tests.
     */
    protected void createPreprocessingContext(String query)
    {
        context = createPreprocessingContext(query, documents);
    }

    private PreprocessingContext createPreprocessingContext(String query,
        final List<Document> documents)
    {
        return new PreprocessingContext(
            LanguageModel.create(LanguageCode.ENGLISH, createStemmerFactory(),
                createTokenizerFactory(), createLexicalDataFactory()), documents, query);
    }

    /**
     * Creates the {@link ITokenizerFactory} to be used in tests. This implementation
     * returns a new instance of {@link DefaultTokenizerFactory}; override to use a
     * different factory.
     */
    protected ITokenizerFactory createTokenizerFactory()
    {
        return new DefaultTokenizerFactory();
    }

    /**
     * Creates the {@link IStemmerFactory} to be used in tests. This implementation
     * returns a new instance of {@link DefaultStemmerFactory}; override to use a
     * different factory.
     */
    protected IStemmerFactory createStemmerFactory()
    {
        return new DefaultStemmerFactory();
    }

    /**
     * Creates the {@link ILexicalDataFactory} to be used in tests. This implementation
     * returns a new instance of {@link DefaultLexicalDataFactory}; override to use a
     * different factory.
     */
    protected ILexicalDataFactory createLexicalDataFactory()
    {
        return new DefaultLexicalDataFactory();
    }
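    /*
     * A minimal sketch (hypothetical, not taken from the Carrot2 code base) of how a
     * subclass could swap in a different factory by overriding one of the hooks above.
     * MyCustomStemmerFactory is an assumed, purely illustrative implementation of
     * IStemmerFactory:
     *
     *   public class MyPreprocessingTest extends PreprocessingComponentTestBase
     *   {
     *       @Override
     *       protected IStemmerFactory createStemmerFactory()
     *       {
     *           return new MyCustomStemmerFactory(); // hypothetical factory
     *       }
     *   }
     */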
    /**
     * A utility method for creating documents for tests. See subclasses for usage
     * examples.
     *
     * @param fields names of fields to create
     * @param fieldValues values for the fields; one document is created for each
     *            <code>fields.length</code> values.
     */
    protected void createDocumentsWithFields(String [] fields, String... fieldValues)
    {
        int fieldValuesIndex = 0;
        while (fieldValuesIndex < fieldValues.length)
        {
            Document document = new Document();
            for (String fieldName : fields)
            {
                document.setField(fieldName, fieldValues[fieldValuesIndex++]);
                if (fieldValuesIndex >= fieldValues.length)
                {
                    break;
                }
            }
            documents.add(document);
        }

        Document.assignDocumentIds(documents);
        prepareWordIndices();
    }

    /**
     * Creates documents with {@link #DEFAULT_DOCUMENT_FIELD_NAMES}.
     */
    protected void createDocuments(String... fieldValues)
    {
        createDocumentsWithFields(DEFAULT_DOCUMENT_FIELD_NAMES, fieldValues);
    }

    /**
     * Default field names.
     */
    final static String [] DEFAULT_DOCUMENT_FIELD_NAMES = new String []
    {
        Document.TITLE, Document.SUMMARY
    };

    /**
     * Initializes the word image to index mapping.
     */
    protected void prepareWordIndices()
    {
        final Tokenizer temporaryTokenizer = new Tokenizer();
        final CaseNormalizer temporaryCaseNormalizer = new CaseNormalizer();
        final PreprocessingContext temporaryContext = createPreprocessingContext(null,
            documents);

        temporaryTokenizer.tokenize(temporaryContext);
        temporaryCaseNormalizer.normalize(temporaryContext);

        final char [][] images = temporaryContext.allWords.image;
        wordIndices = Maps.newHashMap();
        for (int i = 0; i < images.length; i++)
        {
            wordIndices.put(new String(images[i]), i);
        }
    }
}
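/*
 * Usage sketch (hypothetical test data, for illustration only): with the default
 * field names {Document.TITLE, Document.SUMMARY}, the call
 *
 *   createDocuments("data mining", "data mining tutorials",
 *                   "data mining", "data mining software");
 *
 * creates two documents with two fields each; prepareWordIndices() then fills
 * wordIndices so that each distinct, case-normalized word image (e.g. "mining")
 * maps to its index in allWords.
 */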