/* Copyright (c) 2006 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sample.gbase.recipe;

import com.google.api.gbase.client.AttributeHistogram;
import com.google.api.gbase.client.FeedURLFactory;
import com.google.api.gbase.client.GoogleBaseEntry;
import com.google.api.gbase.client.GoogleBaseFeed;
import com.google.api.gbase.client.GoogleBaseQuery;
import com.google.api.gbase.client.GoogleBaseService;
import com.google.api.gbase.client.MetadataEntryExtension;
import com.google.api.gbase.client.AttributeHistogram.UniqueValue;
import com.google.gdata.util.ServiceException;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import javax.servlet.ServletContext;

/**
 * Holds the most used values of attributes in Google Base and
 * periodically refreshes them.
 *
 * A MostUsedValues object is bound to one GoogleBaseService, one
 * FeedURLFactory and one query string, so a single object analyses only
 * the items that match that query.
 *
 * The class currently works for TEXT attributes only.
 *
 * The {@link #cache(long, int, ServletContext, String[])} and
 * {@link #clear()} calls are synchronized, so that the collection of timers
 * and the cached values are changed together. The cached values themselves
 * live in a {@link java.util.Hashtable}, whose individual operations are
 * synchronized.
 *
 * Typically, right after you create a MostUsedValues object, you call the
 * {@link #cache(long, int, ServletContext, String[])} method to set it up
 * to cache the most used values of your attributes. This creates
 * {@link java.util.TimerTask} objects that periodically refresh the cached
 * values. After this, you use {@link #getMostUsedValuesForAttribute(String)}
 * to get the most used values for the attributes you previously specified.
 * In the end, when you no longer need the cache, you can call
 * {@link #clear()} to stop the TimerTask objects and clear the cache.
 */
public class MostUsedValues {

  /** Suffix appended to an attribute name to designate its TEXT-typed form. */
  protected static final String TEXT_TYPE = "(text)";

  /**
   * The initial value of the max results limit when making a query;
   * also the step used to increase that limit if some of the attributes
   * are not found.
   */
  protected static final int STEP_MAXRESULTS = 25;

  /**
   * The upper bound of the max results limit when making a query.
   */
  protected static final int MAX_MAXRESULTS = 200;

  /**
   * The cached most used values, keyed by attribute name.
   * This is a synchronized map.
   */
  private final java.util.Map<String, String[]> mostUsedValues;

  /**
   * The timers that periodically refresh the cache.
   * The access to this object has to be synchronized.
   */
  private final Collection<Timer> timers;

  private final GoogleBaseService service;
  private final FeedURLFactory urlFactory;
  private final String queryString;

  /**
   * Creates an empty MostUsedValues.
   *
   * @param service any GoogleBaseService used to retrieve attribute histograms
   * @param urlFactory a FeedURLFactory used to create the URLs of
   *                   the attribute histograms
   * @param queryString the query string used to filter the analyzed items,
   *                    for example one that focuses only on the items that
   *                    have a specific item type.
   */
  public MostUsedValues(GoogleBaseService service,
                        FeedURLFactory urlFactory,
                        String queryString) {
    this.service = service;
    this.urlFactory = urlFactory;
    this.queryString = queryString;
    mostUsedValues = new Hashtable<String, String[]>();
    timers = new ArrayList<Timer>();
  }

  /**
   * Gets the cached most used values of an attribute.
   *
   * The returned array is the cached instance itself, not a copy, so callers
   * must not modify it.
   *
   * @param attrName the name of the attribute
   * @return the cached most used values, or {@code null} if nothing is
   *         cached for that attribute
   */
  public String[] getMostUsedValuesForAttribute(String attrName) {
    return mostUsedValues.get(attrName);
  }

  /**
   * Sets up the object to cache a limited number of the most used values of
   * each of the specified attributes; the cache is refreshed periodically.
   *
   * The values are retrieved once, synchronously, before this method returns,
   * and then again every {@code interval} milliseconds on a daemon timer
   * thread. Calling this method with no attribute names does nothing.
   *
   * This method does not check if the specified attributes are already cached.
   * You have to make sure an attribute is specified only once in your calls.
   *
   * @param interval the cache refresh period, in millis
   * @param maxValues the maximum number of values to cache
   * @param servletContext a ServletContext to be used for logging
   * @param attrNames the names of the attributes to be cached
   * @throws IllegalArgumentException if {@code interval} is rejected by
   *         {@link Timer#schedule(TimerTask, long, long)}
   */
  public synchronized void cache(long interval,
                                 final int maxValues,
                                 ServletContext servletContext,
                                 final String... attrNames) {
    if (attrNames == null || attrNames.length == 0) {
      return;
    }

    // The refresher keeps using the names for as long as the timer lives,
    // so it gets its own copy rather than the caller's array.
    final String[] names = attrNames.clone();
    TimerTask task = createRefresher(maxValues, servletContext, names);

    // Populate the cache right away instead of waiting a full interval.
    task.run();

    // The Timer starts a thread as soon as it is constructed; create it only
    // once the initial refresh is done, and cancel it if scheduling fails so
    // the thread is not leaked.
    Timer timer = new Timer(true);
    try {
      timer.schedule(task, interval, interval);
    } catch (RuntimeException e) {
      timer.cancel();
      throw e;
    }
    timers.add(timer);
  }

  /**
   * Creates a TimerTask that refreshes the cached most used values of
   * the specified attributes.
   *
   * Failures during a refresh are logged to the servlet context and do not
   * propagate out of the task.
   *
   * @param maxValues how many values to cache for each attribute
   * @param servletContext servlet context for logging error messages
   * @param attrNames the names of the attributes
   * @return a TimerTask that refreshes the cache
   */
  private TimerTask createRefresher(final int maxValues,
                                    final ServletContext servletContext,
                                    final String... attrNames) {
    return new TimerTask() {
      /**
       * Tells the MostUsedValues object that created this TimerTask to
       * refresh the cached most used values of some of the attributes.
       */
      @Override
      public void run() {
        try {
          MostUsedValues.this.retrieveMostUsedValues(maxValues, attrNames);
        } catch (IOException e) {
          servletContext.log(e.getMessage(), e);
        } catch (ServiceException e) {
          servletContext.log(e.getMessage() + " "
              + e.getHttpErrorCodeOverride() + " "
              + e.getResponseContentType() + ": "
              + e.getResponseBody(), e);
        } catch (RuntimeException e) {
          // An exception escaping run() terminates the Timer's thread, which
          // would silently stop every later refresh; log it and carry on.
          servletContext.log(e.getMessage(), e);
        }
      }
    };
  }

  /**
   * Retrieves the most used values for some attributes
   * for the items that match the query string
   * and stores a limited number of those values for each attribute.
   *
   * The attributes feed is queried with a result limit that starts at
   * {@link #STEP_MAXRESULTS} and grows by the same amount, up to
   * {@link #MAX_MAXRESULTS}, until every requested attribute has been found.
   * Calling this method with no attribute names does nothing.
   *
   * @param numValue maximum number of values to store
   * @param attrNames the names of the attributes
   * @throws ServiceException if the service reports an error, or if some of
   *         the attributes could not be found in the retrieved histograms
   * @throws IOException if communicating with the service fails
   */
  protected void retrieveMostUsedValues(int numValue,
                                        final String... attrNames)
      throws ServiceException, IOException {
    if (attrNames == null || attrNames.length == 0) {
      return;
    }

    URL url = urlFactory.getAttributesFeedURL();
    GoogleBaseQuery query = new GoogleBaseQuery(url);
    query.setGoogleBaseQuery(createQueryString(attrNames).toString());
    query.setMaxValues(numValue);

    int numResults = 0;
    int lastNumResults = 0;
    Collection<String> attrToRetrieve =
        new ArrayList<String>(Arrays.asList(attrNames));
    do {
      // Get the feed, asking for one more step of results than last time.
      numResults += STEP_MAXRESULTS;
      query.setMaxResults(numResults);
      GoogleBaseFeed feed = service.query(query);

      // NOTE(review): this treats an unchanged getTotalResults() as "the
      // larger limit brought nothing new" -- confirm against the feed's
      // actual semantics.
      if (lastNumResults == feed.getTotalResults()) {
        // No new entries to process
        break;
      }
      lastNumResults = feed.getTotalResults();

      // Extract the values from the entries
      Iterator<String> attrIter = attrToRetrieve.iterator();
      while (attrIter.hasNext()) {
        String attrName = attrIter.next();
        // Searching for the entry with the following name
        String entryTitle = attrName + TEXT_TYPE;
        for (GoogleBaseEntry entry : feed.getEntries()) {
          if (entryTitle.equals(entry.getTitle().getPlainText())) {
            extractValuesFromEntry(numValue, attrName, entry);
            attrIter.remove();
            // Stop at the first match: Iterator.remove() may be called only
            // once per next(), so a second entry with the same title must
            // not reach it again.
            break;
          }
        }
      }
      // Strict comparison keeps the limit from ever exceeding MAX_MAXRESULTS.
    } while (!attrToRetrieve.isEmpty() && numResults < MAX_MAXRESULTS);

    if (!attrToRetrieve.isEmpty()) {
      throw new ServiceException("The retrieved histograms do not contain "
          + "some of the attributes. The most used values of these attributes "
          + "have not been refreshed.");
    }
  }

  /**
   * Returns the query string extended so that it filters out the items
   * that do not have at least one of the specified attributes of type TEXT.
   *
   * The extension has the form {@code ([a(text)]|[b(text)]|...)} and is
   * appended directly to the query string. If no attribute names are given,
   * the query string is returned without any extension.
   *
   * @param attrNames the attributes we are interested in
   * @return an extended query string
   */
  protected StringBuffer createQueryString(final String... attrNames) {
    StringBuffer extended = new StringBuffer(this.queryString);
    if (attrNames == null || attrNames.length == 0) {
      return extended;
    }

    extended.append("(");
    for (int i = 0; i < attrNames.length; i++) {
      if (i > 0) {
        extended.append("|");
      }
      extended.append("[").append(attrNames[i]).append(TEXT_TYPE).append("]");
    }
    extended.append(")");
    return extended;
  }

  /**
   * Caches a limited number of the values of a GoogleBaseEntry.
   *
   * The values are taken in the order in which the entry's attribute
   * histogram lists them.
   *
   * @param numValue maximum number of values to cache
   * @param attrName the name of the attribute that has the values
   * @param entry an entry with a MetadataEntryExtension
   */
  private void extractValuesFromEntry(int numValue,
                                      String attrName,
                                      GoogleBaseEntry entry) {
    MetadataEntryExtension metadata = entry.getGoogleBaseMetadata();
    AttributeHistogram attributeHistogram = metadata.getAttributeHistogram();
    List<? extends UniqueValue> values = attributeHistogram.getValues();

    // Clamp at zero so a negative limit yields an empty array rather than
    // a NegativeArraySizeException.
    int valuesCount = Math.max(0, Math.min(numValue, values.size()));
    String[] usedValues = new String[valuesCount];
    for (int i = 0; i < valuesCount; i++) {
      usedValues[i] = values.get(i).getValueAsString();
    }
    updateMostUsedValue(attrName, usedValues);
  }

  /**
   * Cancels all refresh timers and clears the cache.
   */
  public synchronized void clear() {
    for (Timer timer : timers) {
      timer.cancel();
    }
    timers.clear();
    mostUsedValues.clear();
  }

  /**
   * Returns the number of cached attributes.
   */
  public int size() {
    return mostUsedValues.size();
  }

  /**
   * Returns true if no attribute is cached.
   */
  public boolean isEmpty() {
    return mostUsedValues.isEmpty();
  }

  /**
   * Returns the query string this object was created with.
   */
  public String getQueryString() {
    return queryString;
  }

  /**
   * Caches the most used values of an attribute, replacing any values
   * previously cached for it.
   *
   * @param attrName the name of the attribute
   * @param stringValues the values to cache for the attribute
   */
  protected void updateMostUsedValue(String attrName, String[] stringValues) {
    mostUsedValues.put(attrName, stringValues);
  }
}