/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. * See the LICENSE file in the jre folder for more information. ******************************************************************************/ package edu.cmu.cs.hcii.cogtool.model; import java.util.List; import org.apache.commons.lang.builder.HashCodeBuilder; import edu.cmu.cs.hcii.cogtool.CogTool; import edu.cmu.cs.hcii.cogtool.CogToolPref; import edu.cmu.cs.hcii.cogtool.util.ObjectLoader; import edu.cmu.cs.hcii.cogtool.util.ObjectSaver; //The names of classes and interfaces around this are terrible, but we can't //change them without breaking old .cgt files, since our persistence //mechanism leaks the implementation detail of our class names into the //abstraction of our file format. //Note that there //is no such thing as an un-cached TermSimilarity. //Both GoogleSimilarity and CachedGoogleSimilarity are direct subsclasses //of CachedTermSimilarity, and the one with cached in its name does not //inherit from the one without. All very confusing. //TODO once we augment our persistence mechanism in such a way that we // can actually rename persistable classes, we should tidy up these names. /** * Corresponds to computeGSimilarity in Leonghwee's code. * But in the UI it's called PMI-G (word). * It also differs from Leongwhee's code in that we now normalize the results. */ public class CachedGoogleSimilarity extends CachedTermSimilarity implements ISitedTermSimilarity { public static final int edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version = 0; protected static final String contextSiteVAR = "contextSite"; private static ObjectSaver.IDataSaver<CachedGoogleSimilarity> SAVER = new ObjectSaver.ADataSaver<CachedGoogleSimilarity>() { @Override public int getVersion() { return edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version; } @Override public void saveData(CachedGoogleSimilarity v, ObjectSaver saver) throws java.io.IOException { saver.saveObject(v.site, contextSiteVAR); } }; public static void registerSaver() { ObjectSaver.registerSaver(CachedGoogleSimilarity.class.getName(), SAVER); } private static ObjectLoader.IObjectLoader<CachedGoogleSimilarity> LOADER = new ObjectLoader.AObjectLoader<CachedGoogleSimilarity>() { @Override public CachedGoogleSimilarity createObject() { return new CachedGoogleSimilarity(); } @Override public void set(CachedGoogleSimilarity target, String variable, Object value) { if (variable != null) { if (variable.equals(contextSiteVAR)) { target.site = (String) value; } } } }; public static void registerLoader() { ObjectLoader.registerLoader(CachedGoogleSimilarity.class.getName(), edu_cmu_cs_hcii_cogtool_model_CachedGoogleSimilarity_version, LOADER); } protected String site = null; public static final double PMI_G_SIZE_AUTOMATIC = -1.0; protected static CachedGoogleSimilarity checkAlg = null; public static CachedGoogleSimilarity create(String limitToSite) { if (checkAlg == null) { checkAlg = new CachedGoogleSimilarity(); } checkAlg.resetSite(limitToSite); CachedGoogleSimilarity registered = (CachedGoogleSimilarity) AlgorithmRegistry.ONLY.register(checkAlg); // If null, then the registry used checkAlg and registered it. if (registered == null) { registered = checkAlg; checkAlg = null; } // registered now contains the algorithm to use return registered; } protected CachedGoogleSimilarity() { } /** * This algorithm also allows one to provide a document set scope * by limiting the fetches to a specified site. */ protected void resetSite(String limitToSite) { if ("".equals(limitToSite)) { limitToSite = null; } site = limitToSite; } @Override protected IWordFrequencyParser getWordFreqParser(String word, List<String> errors) { GoogleSimilarity.termFreqParser.reset(word, errors, site); return GoogleSimilarity.termFreqParser; } // getWordFreqParser protected static final String COMMON_WORD = "the"; /** * No need for a similarity URL processor; word similarity is computed * purely using word frequencies. */ @Override protected Double fetchWordSimilarity(String goalWord, String searchWord, List<String> errors) { double indexSize = CogToolPref.PMI_G_SIZE.getDouble(); if (indexSize == CachedGoogleSimilarity.PMI_G_SIZE_AUTOMATIC) { indexSize = getWordFrequency(COMMON_WORD, errors); } double numerator = (getWordFrequency(goalWord + " " + searchWord, errors)) / indexSize; double denominator = ((getWordFrequency(goalWord, errors)) / indexSize) * ((getWordFrequency(searchWord, errors)) / indexSize); // TODO we may want to normalize the results differently if (denominator != 0.0) { double unnormalized = Math.log10(numerator / denominator); double normalized = unnormalized / Math.log10(indexSize); CogTool.logger.finer(String.format( "Normalizing PMI-G (word) value (%s, %s) from %g to %g", goalWord, searchWord, unnormalized, normalized)); return new Double(normalized); } CogTool.logger.finer(String.format( "Unknown PMI-G (word) similarity for %s, %s", goalWord, searchWord)); return UNKNOWN_SIMILARITY; } @Override protected ISimilarityParser getSimilarityParser(String goal, String search, List<String> errors) { throw new UnsupportedOperationException("No similarity parser is required for CachedGoogle for: " + goal + ", " + search); } public String getContextSite() { return site; } @Override public boolean equals(Object other) { if (other instanceof CachedGoogleSimilarity) { CachedGoogleSimilarity cgs = (CachedGoogleSimilarity) other; if (site == null) { return cgs.site == null; } return site.equals(cgs.site); } return false; } @Override public int hashCode() { // Must have a unique ODD number for each class which uses // hashCodeBuilder. // this : 139, 207 HashCodeBuilder hcb = new HashCodeBuilder(139, 207); if (site != null) { hcb.append(site.hashCode()); } return hcb.toHashCode(); } public ITermSimilarity duplicate() { CachedGoogleSimilarity cgs = new CachedGoogleSimilarity(); cgs.site = site; return cgs; } }