/******************************************************************************* * CogTool Copyright Notice and Distribution Terms * CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * CogTool is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * CogTool is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with CogTool; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * * CogTool makes use of several third-party components, with the * following notices: * * Eclipse SWT version 3.448 * Eclipse GEF Draw2D version 3.2.1 * * Unless otherwise indicated, all Content made available by the Eclipse * Foundation is provided to you under the terms and conditions of the Eclipse * Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this * Content and is also available at http://www.eclipse.org/legal/epl-v10.html. * * CLISP version 2.38 * * Copyright (c) Sam Steingold, Bruno Haible 2001-2006 * This software is distributed under the terms of the FSF Gnu Public License. * See COPYRIGHT file in clisp installation folder for more information. * * ACT-R 6.0 * * Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere & * John R Anderson. * This software is distributed under the terms of the FSF Lesser * Gnu Public License (see LGPL.txt). * * Apache Jakarta Commons-Lang 2.1 * * This product contains software developed by the Apache Software Foundation * (http://www.apache.org/) * * jopt-simple version 1.0 * * Copyright (c) 2004-2013 Paul R. Holser, Jr. * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Mozilla XULRunner 1.9.0.5 * * The contents of this file are subject to the Mozilla Public License * Version 1.1 (the "License"); you may not use this file except in * compliance with the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/. * Software distributed under the License is distributed on an "AS IS" * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the * License for the specific language governing rights and limitations * under the License. * * The J2SE(TM) Java Runtime Environment version 5.0 * * Copyright 2009 Sun Microsystems, Inc., 4150 * Network Circle, Santa Clara, California 95054, U.S.A. All * rights reserved. U.S. * See the LICENSE file in the jre folder for more information. ******************************************************************************/ package edu.cmu.cs.hcii.cogtool.model; import java.io.BufferedReader; import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.List; import org.apache.commons.lang.builder.HashCodeBuilder; import edu.cmu.cs.hcii.cogtool.CogTool; import edu.cmu.cs.hcii.cogtool.util.FetchURLUtil; import edu.cmu.cs.hcii.cogtool.util.ObjectLoader; import edu.cmu.cs.hcii.cogtool.util.ObjectSaver; //The names of classes and interfaces around this are terrible, but we can't //change them without breaking old .cgt files, since our persistence //mechanism leaks the implementation detail of our class names into the //abstraction of our file format. //Note that there //is no such thing as an un-cached TermSimilarity. //Both GoogleSimilarity and CachedGoogleSimilarity are direct subsclasses //of CachedTermSimilarity, and the one with cached in its name does not //inherit from the one without. All very confusing. //TODO once we augment our persistence mechanism in such a way that we // can actually rename persistable classes, we should tidy up these names. /** * Corresponds to computeGEntireSimilarity in Leonghwee's code. * But in the UI it's called PMI-G (phrase). */ public class GoogleSimilarity extends CachedTermSimilarity implements ISitedTermSimilarity { public static final int edu_cmu_cs_hcii_cogtool_model_GoogleSimilarity_version = 0; protected static final String contextSiteVAR = "contextSite"; private static ObjectSaver.IDataSaver<GoogleSimilarity> SAVER = new ObjectSaver.ADataSaver<GoogleSimilarity>() { @Override public int getVersion() { return edu_cmu_cs_hcii_cogtool_model_GoogleSimilarity_version; } @Override public void saveData(GoogleSimilarity v, ObjectSaver saver) throws java.io.IOException { saver.saveObject(v.site, contextSiteVAR); } }; public static void registerSaver() { ObjectSaver.registerSaver(GoogleSimilarity.class.getName(), SAVER); } private static ObjectLoader.IObjectLoader<GoogleSimilarity> LOADER = new ObjectLoader.AObjectLoader<GoogleSimilarity>() { @Override public GoogleSimilarity createObject() { return new GoogleSimilarity(); } @Override public void set(GoogleSimilarity target, String variable, Object value) { if (variable != null) { if (variable.equals(contextSiteVAR)) { target.site = (String) value; } } } }; public static void registerLoader() { ObjectLoader.registerLoader(GoogleSimilarity.class.getName(), edu_cmu_cs_hcii_cogtool_model_GoogleSimilarity_version, LOADER); } protected String site = null; protected static GoogleSimilarity checkAlg = null; public static GoogleSimilarity create(String limitToSite) { if (checkAlg == null) { checkAlg = new GoogleSimilarity(); } checkAlg.resetSite(limitToSite); GoogleSimilarity registered = (GoogleSimilarity) AlgorithmRegistry.ONLY.register(checkAlg); // If null, then the registry used checkAlg and registered it. if (registered == null) { registered = checkAlg; checkAlg = null; } // registered now contains the algorithm to use return registered; } public GoogleSimilarity() { } public void resetSite(String limitToSite) { if ("".equals(limitToSite)) { limitToSite = null; } site = limitToSite; } /** * Works for terms (i.e., phrases) or words! */ protected static class ProcessTermFrequency extends FetchURLUtil.AURLProcessor implements IWordFrequencyParser { private static final String FETCH_FREQUENCY_URL = "http://cogtool.hcii.cs.cmu.edu/cgi-bin/pmi-g-query?auth=none&q="; private String site = null; private String term = null; private StringBuilder urlSafeTerm = new StringBuilder(); private long frequency = 0; public String getURL() { if (site == null) { return FETCH_FREQUENCY_URL + urlSafeTerm.toString(); } return FETCH_FREQUENCY_URL + urlSafeTerm.toString() + "+site%3A" + site; } public boolean process(BufferedReader rdr) { String inputLine = null; try { inputLine = rdr.readLine(); if (inputLine != null) { inputLine = inputLine.trim(); if (inputLine.equals("")) { frequency = 0; } else { frequency = Long.parseLong(inputLine); } CogTool.logger.fine(String.format( "Fetched frequency for %s, %s: %d", term, site, frequency)); return true; } } catch (IOException ex) { // fall through } catch (NumberFormatException ex) { // fall through } String err = String.format( "Unexpected reply from PMI-G server: %s, %s.", term, inputLine); CogTool.logger.fine(err); List<String> errors = getErrors(); if (errors != null) { errors.add(err); } return true; } public void reset(String forTerm, List<String> errors) { reset(forTerm, errors, null); } public void reset(String forTerm, List<String> errors, String site) { reset(forTerm, errors, site, retryCount); } /** * To allow an instance of this processor to act as a "singleton", * the code that wishes to fetch a frequency using Google should * reset the parameters used for the fetch. * * The given word will be used in the URL to Google. */ public void reset(String forTerm, List<String> errors, String site, int useRetryCount) { term = forTerm.trim(); frequency = 0; urlSafeTerm.delete(0, urlSafeTerm.length()); String[] words = term.split("\\s+"); try { if ((words != null) && (words.length > 0)) { urlSafeTerm.append("%2B"); urlSafeTerm.append(URLEncoder.encode(words[0], "UTF-8")); if (words.length > 1) { for (int i = 1; i < words.length; i++) { urlSafeTerm.append("+%2B"); urlSafeTerm.append(URLEncoder.encode(words[i], "UTF-8")); } } } else { urlSafeTerm.append(URLEncoder.encode(("%2B" + term), "UTF-8")); } } catch (UnsupportedEncodingException ex) { throw new IllegalStateException("Encoding failed for term: " + term); } if ((site != null) && ! site.equals("")) { try { this.site = URLEncoder.encode(site, "UTF-8"); } catch (UnsupportedEncodingException ex) { throw new IllegalStateException("Encoding failed for site: " + site); } } else { this.site = null; } reset(errors, useRetryCount); } /** * Return the term last processed (or attempted). */ public String getTerm() { return term; } // To avoid confusion when dealing with words instead of phrases, // a simple alias. public String getWord() { return getTerm(); } /** * Return the frequency last fetched. */ public long getTermFrequency() { return frequency; } // To avoid confusion when dealing with words instead of phrases, // a simple alias. public long getWordFrequency() { return getTermFrequency(); } } public static ProcessTermFrequency termFreqParser = new ProcessTermFrequency(); protected double getTermFrequency(String term, List<String> errors) { if (term == null) { return 0.0; } termFreqParser.reset(term, errors, site); if (FetchURLUtil.processURL(termFreqParser)) { return termFreqParser.getTermFrequency(); } return 0.0; } /** * No need for a similarity URL processor; term similarity is computed * purely using phrase frequencies. */ @Override protected Double fetchWordSimilarity(String goalTerm, String searchTerm, List<String> errors) { double numerator = getTermFrequency(goalTerm + " " + searchTerm, errors); double denominator = getTermFrequency(searchTerm, errors); if (denominator != 0.0) { // if (numerator > denominator) { // errors.add(String.format( // "GoogleSimilarity confusion: %s; %s; %g; %g", // goalTerm, searchTerm, numerator, denominator)); // return UNKNOWN_SIMILARITY; // } return new Double(numerator / denominator); } return UNKNOWN_SIMILARITY; } /** * We re-use the facilities provided by CachedTermSimilarity (i.e., * the cache data structures) where, instead of words, we use phrases * (i.e., terms). */ @Override public double determineSimilarity(String goalTerm, String searchTerm, List<String> errors, ITermSimilarity.Continuable cont) { return getWordSimilarity(goalTerm, searchTerm, errors); } @Override protected IWordFrequencyParser getWordFreqParser(String word, List<String> errors) { termFreqParser.reset(word, errors); return termFreqParser; } @Override protected ISimilarityParser getSimilarityParser(String goal, String search, List<String> errors) { throw new UnsupportedOperationException("No similarity parser is required for Google for: " + goal + ", " + search); } public String getContextSite() { return site; } @Override public boolean equals(Object other) { if (other instanceof GoogleSimilarity) { GoogleSimilarity cgs = (GoogleSimilarity) other; if (site == null) { return cgs.site == null; } return site.equals(cgs.site); } return false; } @Override public int hashCode() { // Must have a unique ODD number for each class which uses // hashCodeBuilder. // this : 181, 193 HashCodeBuilder hcb = new HashCodeBuilder(181, 193); if (site != null) { hcb.append(site.hashCode()); } return hcb.toHashCode(); } public ITermSimilarity duplicate() { GoogleSimilarity gs = new GoogleSimilarity(); gs.site = site; return gs; } }