/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.clustering.carrot2; import java.util.Map; import java.util.Set; import org.apache.nutch.searcher.HitDetails; import org.carrot2.core.LocalInputComponentBase; import org.carrot2.core.ProcessingException; import org.carrot2.core.RequestContext; import org.carrot2.core.clustering.RawDocumentsConsumer; import org.carrot2.core.clustering.RawDocumentsProducer; /** * An input component that ignores the query passed from the * controller and instead looks for data stored in the request context. * This enables us to reuse the same physical component implementation * for data that has already been acquired from Nutch. 
*/
public class NutchInputComponent extends LocalInputComponentBase {
  /** Request-parameter key under which the {@link HitDetails} array is looked up. */
  public static final String NUTCH_INPUT_HIT_DETAILS_ARRAY = "NUTCH_INPUT_HIT_DETAILS_ARRAY";

  /** Request-parameter key under which the array of summary strings is looked up. */
  public static final String NUTCH_INPUT_SUMMARIES_ARRAY = "NUTCH_INPUT_SUMMARIES_ARRAY";

  /** Capabilities required from the next component in the chain. */
  private static final Set SUCCESSOR_CAPABILITIES = toSet(RawDocumentsConsumer.class);

  /** This component's capabilities. */
  private static final Set COMPONENT_CAPABILITIES = toSet(RawDocumentsProducer.class);

  /**
   * Default language code for hits that don't have their own.
   * Assigned once in the constructor and never changed afterwards.
   */
  private final String defaultLanguage;

  /**
   * Creates an input component with the given default language code.
   *
   * @param defaultLanguage language code passed to every {@link NutchDocument}
   *        created by this component
   */
  public NutchInputComponent(String defaultLanguage) {
    this.defaultLanguage = defaultLanguage;
  }

  /**
   * Accepts the query from the controller and deliberately discards it.
   *
   * @param query ignored; the data is taken from the request context in
   *        {@link #startProcessing(RequestContext)} instead
   */
  public void setQuery(String query) {
    // Ignore the query; data will be provided from the request context.
  }

  /**
   * A callback hook that starts the processing: reads the hit details and
   * summaries from the request parameters and passes one
   * {@link NutchDocument} per hit to the successor component.
   *
   * @param context the request context whose parameters must contain a
   *        <code>HitDetails[]</code> under {@link #NUTCH_INPUT_HIT_DETAILS_ARRAY}
   *        and a <code>String[]</code> of equal length under
   *        {@link #NUTCH_INPUT_SUMMARIES_ARRAY}
   * @throws ProcessingException if either array is missing or if the two
   *         arrays differ in length
   */
  public void startProcessing(RequestContext context) throws ProcessingException {
    // Let successor components know that the processing has started.
    // This intentionally happens before the input is validated, as in the
    // original implementation.
    super.startProcessing(context);

    // Get the information about documents from the context.
    final Map params = context.getRequestParameters();
    final HitDetails[] details = (HitDetails[]) params.get(NUTCH_INPUT_HIT_DETAILS_ARRAY);
    final String[] summaries = (String[]) params.get(NUTCH_INPUT_SUMMARIES_ARRAY);

    if (details == null) {
      throw new ProcessingException("Details array must not be null.");
    }

    if (summaries == null) {
      throw new ProcessingException("Summaries array must not be null.");
    }

    if (summaries.length != details.length) {
      throw new ProcessingException("Summaries and details must be of the same length.");
    }

    // Produce 'documents' for successor components. The cast relies on the
    // successor being a RawDocumentsConsumer, which is the capability this
    // component declares as required (see SUCCESSOR_CAPABILITIES).
    final RawDocumentsConsumer consumer = (RawDocumentsConsumer) next;
    for (int i = 0; i < summaries.length; i++) {
      // The array index is passed as the first constructor argument.
      consumer.addDocument(new NutchDocument(i, details[i], summaries[i], defaultLanguage));
    }
  }

  /**
   * Returns the capabilities provided by this component.
   *
   * @return the shared set containing {@link RawDocumentsProducer}
   */
  public Set getComponentCapabilities() {
    return COMPONENT_CAPABILITIES;
  }

  /**
   * Returns the capabilities required from the successor component.
   *
   * @return the shared set containing {@link RawDocumentsConsumer}
   */
  public Set getRequiredSuccessorCapabilities() {
    return SUCCESSOR_CAPABILITIES;
  }
}