/*******************************************************************************
* CogTool Copyright Notice and Distribution Terms
* CogTool 1.3, Copyright (c) 2005-2013 Carnegie Mellon University
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* CogTool is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* (at your option) any later version.
*
* CogTool is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with CogTool; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* CogTool makes use of several third-party components, with the
* following notices:
*
* Eclipse SWT version 3.448
* Eclipse GEF Draw2D version 3.2.1
*
* Unless otherwise indicated, all Content made available by the Eclipse
* Foundation is provided to you under the terms and conditions of the Eclipse
* Public License Version 1.0 ("EPL"). A copy of the EPL is provided with this
* Content and is also available at http://www.eclipse.org/legal/epl-v10.html.
*
* CLISP version 2.38
*
* Copyright (c) Sam Steingold, Bruno Haible 2001-2006
* This software is distributed under the terms of the FSF Gnu Public License.
* See COPYRIGHT file in clisp installation folder for more information.
*
* ACT-R 6.0
*
* Copyright (c) 1998-2007 Dan Bothell, Mike Byrne, Christian Lebiere &
* John R Anderson.
* This software is distributed under the terms of the FSF Lesser
* Gnu Public License (see LGPL.txt).
*
* Apache Jakarta Commons-Lang 2.1
*
* This product contains software developed by the Apache Software Foundation
* (http://www.apache.org/)
*
* jopt-simple version 1.0
*
* Copyright (c) 2004-2013 Paul R. Holser, Jr.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Mozilla XULRunner 1.9.0.5
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/.
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
* License for the specific language governing rights and limitations
* under the License.
*
* The J2SE(TM) Java Runtime Environment version 5.0
*
* Copyright 2009 Sun Microsystems, Inc., 4150
* Network Circle, Santa Clara, California 95054, U.S.A. All
* rights reserved. U.S.
* See the LICENSE file in the jre folder for more information.
******************************************************************************/
package edu.cmu.cs.hcii.cogtool.model;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.builder.HashCodeBuilder;
import edu.cmu.cs.hcii.cogtool.util.FetchURLUtil;
import edu.cmu.cs.hcii.cogtool.util.ObjectLoader;
import edu.cmu.cs.hcii.cogtool.util.ObjectSaver;
//The names of classes and interfaces around this are terrible, but we can't
//change them without breaking old .cgt files, since our persistence
//mechanism leaks the implementation detail of our class names into the
//abstraction of our file format.
//Note that the only implementer of this interface is the abstract calls
//CachedTermSimilarity. All concrete classes are subclasses of that, and there
//is no such thing as an un-cached TermSimilarity.
//Both GoogleSimilarity and CachedGoogleSimilarity are direct subsclasses
//of CachedTermSimilarity, and the one with cached in its name does not
//inherit from the one without. All very confusing.
//TODO once we augment our persistence mechanism in such a way that we
// can actually rename persistable classes, we should tidy up these names.
public class LSASimilarity extends CachedTermSimilarity
{
public static final int edu_cmu_cs_hcii_cogtool_model_LSASimilarity_version = 1;
protected static final String spaceVAR = "space";
protected static final String urlVAR = "url";
private static ObjectSaver.IDataSaver<LSASimilarity> SAVER =
new ObjectSaver.ADataSaver<LSASimilarity>() {
@Override
public int getVersion()
{
return edu_cmu_cs_hcii_cogtool_model_LSASimilarity_version;
}
@Override
public void saveData(LSASimilarity value, ObjectSaver saver)
throws java.io.IOException
{
saver.saveObject(value.space, spaceVAR);
saver.saveObject(value.url, urlVAR);
}
};
public static void registerSaver()
{
ObjectSaver.registerSaver(LSASimilarity.class.getName(),
SAVER);
}
private static ObjectLoader.IObjectLoader<LSASimilarity> LOADER =
new ObjectLoader.AObjectLoader<LSASimilarity>() {
@Override
public LSASimilarity createObject()
{
return new LSASimilarity();
}
@Override
public void set(LSASimilarity target,
String variable,
Object value)
{
if (variable != null) {
if (variable.equals(spaceVAR)) {
target.space = (String) value;
}
else if (variable.equals(urlVAR)) {
target.url = (String) value;
}
}
}
};
private static ObjectLoader.IObjectLoader<LSASimilarity> LOADER_v0 =
new ObjectLoader.AObjectLoader<LSASimilarity>() {
@Override
public LSASimilarity createObject()
{
return new LSASimilarity();
}
};
public static void registerLoader()
{
ObjectLoader.registerLoader(LSASimilarity.class.getName(),
0,
LOADER_v0);
ObjectLoader.registerLoader(LSASimilarity.class.getName(),
edu_cmu_cs_hcii_cogtool_model_LSASimilarity_version,
LOADER);
}
/**
* Default space
*/
public static final String DEFAULT_SPACE =
"General_Reading_Up_to_1st_Year_College";
/**
* Known "spaces"
*/
public static final String[] KNOWN_SPACES =
{ DEFAULT_SPACE,
"General_Reading_Up_to_12th_Grade",
"General_Reading_Up_to_9th_Grade",
"General_Reading_Up_to_6th_Grade",
"General_Reading_Up_to_3rd_Grade",
"French-Monde-1993",
"CDUskills_v1",
"ExpertPilot_v1",
"CDUskills_v2",
"CDUskills_v3",
"CDUskills_v4",
"CDUskills_v5",
"ExpertPilot_v2",
"ExpertPilot_v3",
"ExpertPilot_v4",
"ExpertPilot_v5",
"CDUskills",
"ExpertPilot"
};
/**
* If url is null, then use the following constant.
*/
public final static String DEFAULT_LSA_URL =
"http://autocww2.colorado.edu/cgi-bin/nph-elaborate.cgi?Frequency=50&Cosine=0.5&";
//"http://aviationknowledge.colorado.edu/cgi-bin/nph-elaborate.cgi?Frequency=50&Cosine=0.5&";
protected String space = DEFAULT_SPACE;
protected String url = null;
@Override
protected IWordFrequencyParser getWordFreqParser(String word,
List<String> errors)
{
// don't need a word parser for this algorithm
return null;
}
// TODO we need to make the LSA parsing less fragile in the face of
// changes to the format returned
private static Pattern NEW_SERVER_PATTERN =
Pattern.compile("<TD>\\s*Cosine\\s*</TD>.*<TD>\\s*(-?\\d+\\.\\d+)\\s*\\($|<BR>\\s*</TD>\\)");
private static Pattern ALTERNATIVE_RESULT_PATTERN =
Pattern.compile("</TD>\\s*<TD>\\s*(-?\\d+\\.\\d+)\\s*$");
protected class ProcessGoalSimilarity extends FetchURLUtil.AURLProcessor
implements ISimilarityParser
{
protected String goalWord = null;
protected String searchWord = null;
protected String urlSafeGoal = null;
protected String urlSafeSearch = null;
protected double similarity = UNKNOWN;
public String getURL()
{
String result =
((url != null) ? url : DEFAULT_LSA_URL)
+ "Space=" + space
+ "&Goal=" + urlSafeGoal
+ "&Links=" + urlSafeSearch;
return result;
}
/**
* Possible responses (tokens separated by white space):
*
* BOTH WORDS UNKNOWN:
* or
* ONE WORD UNKNOWN:
* The string "Can't find any terms from text :" in the source
*
* BOTH WORDS KNOWN:
* Similarity value appears on a line by itself in a table
* following the line "</TD> <TD>"
*
* ALSO NOW:
* for a new instance of the server there's a new HTML format
* returned so there's a different possibility, too; sigh
*
*/
public boolean process(BufferedReader rdr)
{
List<String> errors = getErrors();
try {
String inputLine;
boolean parseNext = false;
while ((inputLine = rdr.readLine()) != null) {
if (inputLine.indexOf("Can't find any terms from text :") != -1)
{
// Similarity is unknown
return true;
}
Matcher m = NEW_SERVER_PATTERN.matcher(inputLine);
if (m.find()) {
similarity = Double.parseDouble(m.group(1));
return true;
}
if (parseNext) {
similarity = Double.parseDouble(inputLine);
return true;
}
m = ALTERNATIVE_RESULT_PATTERN.matcher(inputLine);
if (m.find()) {
similarity = Double.parseDouble(m.group(1));
return true;
}
if ("</TD> <TD>".equals(inputLine)) {
parseNext = true;
}
}
}
catch (IOException ex) {
ex.printStackTrace();
if (errors != null) {
errors.add("Reader readLine error trying to fetch from LSA:GetSimilarity for: "
+ goalWord + ", "
+ searchWord);
}
}
catch (NumberFormatException ex) {
ex.printStackTrace();
if (errors != null) {
errors.add("Reader readLine error trying to parse the similarity "
+ ex);
}
}
return false;
} // process
public void reset(String goal, String search, List<String> errors)
{
reset(goal, search, errors, retryCount);
}
/**
* To allow an instance of this processor to act as a "singleton",
* the code that wishes to fetch a word-pair similarity using LSA@PARC
* should reset the parameters used for the fetch.
*
* The given words will be used in the URL to LSA@PARC.
*/
public void reset(String goal,
String search,
List<String> errors,
int useRetryCount)
{
goalWord = goal;
searchWord = search;
similarity = UNKNOWN;
try {
urlSafeGoal = URLEncoder.encode(goalWord, "UTF-8");
}
catch (UnsupportedEncodingException ex) {
throw new IllegalStateException("Encoding failed for word: "
+ goalWord);
}
try {
urlSafeSearch =
URLEncoder.encode(searchWord, "UTF-8");
}
catch (UnsupportedEncodingException ex) {
throw new IllegalStateException("Encoding failed for word: "
+ searchWord);
}
reset(errors, useRetryCount);
}
/**
* Return the goal word last processed (or attempted).
*/
public String getGoalWord()
{
return goalWord;
}
/**
* Return the search word last processed (or attempted).
*/
public String getSearchWord()
{
return searchWord;
}
/**
* Return the similarity last fetched.
*/
public double getSimilarity()
{
return similarity;
}
}
protected ProcessGoalSimilarity goalSimilarityParser = null;
protected LSASimilarity()
{
// For create and loading
}
protected static LSASimilarity checkAlg = null;
public static LSASimilarity create()
{
return create(null, null);
}
public static LSASimilarity create(String useSpace)
{
return create(useSpace, null);
}
public static LSASimilarity create(String useSpace, String useURL)
{
if ((useSpace == null) || "".equals(useSpace)) {
useSpace = DEFAULT_SPACE;
}
if ("".equals(useURL) || DEFAULT_LSA_URL.equals(useURL)) {
useURL = null;
}
if (checkAlg == null) {
checkAlg = new LSASimilarity();
}
checkAlg.space = useSpace;
checkAlg.url = useURL;
LSASimilarity registered =
(LSASimilarity) AlgorithmRegistry.ONLY.register(checkAlg);
// If null, then the registry used checkAlg and registered it.
if (registered == null) {
registered = checkAlg;
checkAlg = null;
}
// registered now contains the algorithm to use
return registered;
}
@Override
protected ISimilarityParser getSimilarityParser(String goal,
String search,
List<String> errors)
{
if (goalSimilarityParser == null) {
goalSimilarityParser = new ProcessGoalSimilarity();
}
goalSimilarityParser.reset(goal, search, errors);
return goalSimilarityParser;
}
@Override
public double determineSimilarity(String goal,
String search,
List<String> errors,
ITermSimilarity.Continuable cont)
{
return getWordSimilarity(goal, search, errors);
}
public String getSpace()
{
return space;
}
public void setSpace(String newSpace)
{
if ((newSpace == null) || "".equals(newSpace)) {
newSpace = DEFAULT_SPACE;
}
space = newSpace;
}
// Default URL may change from when the object was saved.
public String getURL()
{
return (url == null) ? DEFAULT_LSA_URL : url;
}
public void setURL(String newURL)
{
if ("".equals(newURL) || DEFAULT_LSA_URL.equals(newURL)) {
newURL = null;
}
url = newURL;
}
@Override
public boolean equals(Object other)
{
if (other instanceof LSASimilarity) {
LSASimilarity otherLSA = (LSASimilarity) other;
if (space.equals(otherLSA.space)) {
if (url == null) {
return otherLSA.url == null;
}
return url.equals(otherLSA.url);
}
}
return false;
}
@Override
public int hashCode()
{
// Must have a unique ODD number for each class which uses
// hashCodeBuilder.
// this : 531, 447
HashCodeBuilder hcb = new HashCodeBuilder(531, 447);
hcb.append(space.hashCode());
if (url != null) {
hcb.append(url.hashCode());
}
return hcb.toHashCode();
}
public ITermSimilarity duplicate()
{
return this;
}
}