/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.ml.modelinput;

import org.elasticsearch.common.collect.Tuple;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Abstraction over a data source that can provide information about a single record.
 * <p>
 * Term-based features are exposed in two layouts: dense (one array slot per requested term)
 * and sparse (an index array paired with a value array).
 */
public interface DataSource {

    /**
     * Returns the list of values stored for the given field.
     *
     * @param field name of the field to read
     * @param <T>   element type the caller expects the values to have
     * @return the values of {@code field}
     */
    <T> List<T> getValues(String field);

    /**
     * Returns an array of 0s and 1s: 1 if the corresponding term in the terms array is present
     * in the field and 0 otherwise.
     *
     * @param terms terms to look up
     * @param field name of the field to inspect
     * @return occurrence flags for {@code terms}
     */
    double[] getOccurrenceDense(String[] terms, String field);

    /**
     * Returns an array of TF/IDF values for the terms in the specified field.
     *
     * @param terms terms to look up
     * @param field name of the field to inspect
     * @return TF/IDF values for {@code terms}
     */
    double[] getTfIdfDense(String[] terms, String field);

    /**
     * Returns an array of TF values for the terms in the specified field.
     *
     * @param terms terms to look up
     * @param field name of the field to inspect
     * @return TF values for {@code terms}
     */
    double[] getTfDense(String[] terms, String field);

    /**
     * Returns the sparse occurrence representation of the field: the positions of the field's
     * values that are known to {@code wordMap}, each paired with the value 1.
     * <p>
     * The values of {@code field} are read through {@link #getValues(String)} and looked up in
     * {@code wordMap} one by one, in the order the list provides them. Values without a mapping
     * are skipped, so absent terms (the 0s) are simply not represented in the result.
     *
     * @param wordMap maps a term to its index in the feature vector
     * @param field   name of the field whose values are looked up
     * @return a tuple holding the array of matched indices followed by an equally long array
     *         in which every entry is 1
     */
    default Tuple<int[], double[]> getOccurrenceSparse(Map<String, Integer> wordMap, String field) {
        final List<String> fieldTerms = getValues(field);

        // First pass: gather the index of every field value that the word map knows about.
        final List<Integer> matchedPositions = new ArrayList<>();
        for (final String term : fieldTerms) {
            final Integer position = wordMap.get(term);
            if (position == null) {
                // term has no entry in the word map, so it contributes nothing
                continue;
            }
            matchedPositions.add(position);
        }

        // Second pass: unbox into primitive arrays; every matched term gets the value 1.
        final int matchCount = matchedPositions.size();
        final int[] positions = new int[matchCount];
        final double[] occurrences = new double[matchCount];
        int slot = 0;
        for (final Integer position : matchedPositions) {
            positions[slot] = position;
            occurrences[slot] = 1;
            slot++;
        }
        return new Tuple<>(positions, occurrences);
    }

    /**
     * Returns a sparse array of TF/IDF values for the terms in the specified field.
     * <p>
     * NOTE(review): presumably uses the same (indices, values) layout as
     * {@link #getOccurrenceSparse(Map, String)} - confirm against the implementations.
     *
     * @param wordMap maps a term to its index in the feature vector
     * @param field   name of the field to inspect
     * @return a tuple of an index array and a value array
     */
    Tuple<int[], double[]> getTfIdfSparse(Map<String, Integer> wordMap, String field);

    /**
     * Returns a sparse array of TF values for the terms in the specified field.
     * <p>
     * NOTE(review): presumably uses the same (indices, values) layout as
     * {@link #getOccurrenceSparse(Map, String)} - confirm against the implementations.
     *
     * @param wordMap maps a term to its index in the feature vector
     * @param field   name of the field to inspect
     * @return a tuple of an index array and a value array
     */
    Tuple<int[], double[]> getTfSparse(Map<String, Integer> wordMap, String field);
}