/**
* Copyright 2014 Marco Cornolti
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package it.acubelab.smaph.learn;
import it.unipi.di.acube.batframework.data.Tag;
import it.unipi.di.acube.batframework.datasetPlugins.ERD2014Dataset;
import it.unipi.di.acube.batframework.datasetPlugins.SMAPHDataset;
import it.unipi.di.acube.batframework.datasetPlugins.YahooWebscopeL24Dataset;
import it.unipi.di.acube.batframework.problems.C2WDataset;
import it.unipi.di.acube.batframework.systemPlugins.WATAnnotator;
import it.unipi.di.acube.batframework.utils.FreebaseApi;
import it.unipi.di.acube.batframework.utils.WikipediaApiInterface;
import it.acubelab.smaph.SmaphAnnotator;
import it.acubelab.smaph.boldfilters.EditDistanceBoldFilter;
import it.acubelab.smaph.boldfilters.FrequencyBoldFilter;
import it.acubelab.smaph.entityfilters.NoEntityFilter;
import it.acubelab.smaph.linkback.DummyLinkBack;
import it.acubelab.smaph.main.ERDDatasetFilter;
import it.cnr.isti.hpc.erd.WikipediaToFreebase;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Vector;
public class GenerateTrainingAndTest {
public static void gatherExamples(SmaphAnnotator bingAnnotator,
C2WDataset ds, BinaryExampleGatherer entityFilterGatherer,
WikipediaToFreebase wikiToFreeb) throws Exception {
for (int i = 0; i < ds.getSize(); i++) {
String query = ds.getTextInstanceList().get(i);
HashSet<Tag> goldStandard = ds.getC2WGoldStandardList().get(i);
Vector<double[]> posEF = new Vector<>();
Vector<double[]> negEF = new Vector<>();
bingAnnotator.generateExamples(query, goldStandard, posEF, negEF,
true, wikiToFreeb);
entityFilterGatherer.addExample(posEF, negEF);
}
}
public static void gatherExamplesTrainingAndDevel(
SmaphAnnotator bingAnnotator,
BinaryExampleGatherer trainEntityFilterGatherer,
BinaryExampleGatherer develEntityFilterGatherer,
WikipediaApiInterface wikiApi, WikipediaToFreebase wikiToFreebase,
FreebaseApi freebApi) throws Exception {
if (trainEntityFilterGatherer != null) {
{
C2WDataset smaphTrain = new ERDDatasetFilter(new SMAPHDataset(
"datasets/smaph/smaph_training.xml", wikiApi), wikiApi,
wikiToFreebase);
gatherExamples(bingAnnotator, smaphTrain,
trainEntityFilterGatherer, wikiToFreebase);
}
{
C2WDataset smaphTest = new ERDDatasetFilter(new SMAPHDataset(
"datasets/smaph/smaph_test.xml", wikiApi), wikiApi,
wikiToFreebase);
gatherExamples(bingAnnotator, smaphTest,
trainEntityFilterGatherer, wikiToFreebase);
}
{
C2WDataset smaphDevel = new ERDDatasetFilter(new SMAPHDataset(
"datasets/smaph/smaph_devel.xml", wikiApi), wikiApi,
wikiToFreebase);
gatherExamples(bingAnnotator, smaphDevel,
trainEntityFilterGatherer, wikiToFreebase);
}
{
C2WDataset yahoo = new ERDDatasetFilter(
new YahooWebscopeL24Dataset(
"datasets/yahoo_webscope_L24/ydata-search-query-log-to-entities-v1_0.xml"),
wikiApi, wikiToFreebase);
gatherExamples(bingAnnotator, yahoo, trainEntityFilterGatherer,
wikiToFreebase);
}
{
C2WDataset erd = new ERDDatasetFilter(new ERD2014Dataset(
"datasets/erd2014/Trec_beta.query.txt",
"datasets/erd2014/Trec_beta.annotation.txt", freebApi,
wikiApi), wikiApi, wikiToFreebase);
gatherExamples(bingAnnotator, erd, trainEntityFilterGatherer,
wikiToFreebase);
}
}
if (develEntityFilterGatherer != null) {
C2WDataset develDs = new ERDDatasetFilter(new ERD2014Dataset(
"datasets/erd2014/Trec_beta.query.txt",
"datasets/erd2014/Trec_beta.annotation.txt", freebApi,
wikiApi), wikiApi, wikiToFreebase);
for (Tag t : develDs.getC2WGoldStandardList().get(
develDs.getC2WGoldStandardList().size() - 1))
System.out.println(t.getConcept());
gatherExamples(bingAnnotator, develDs, develEntityFilterGatherer,
wikiToFreebase);
}
SmaphAnnotator.flush();
wikiApi.flush();
}
public static SmaphAnnotator getDefaultBingAnnotator(
WikipediaApiInterface wikiApi, WikipediaToFreebase wikiToFreebase,
double editDistanceSpotFilterThreshold, int wikiSearchTopK,
String bingKey) throws FileNotFoundException,
ClassNotFoundException, IOException {
WATAnnotator wikiSense = new WATAnnotator("wikisense.mkapp.it", 80,
"base", "COMMONNESS", "jaccard", "0.6", "0.0"/* minlp */, false,
false, false);
SmaphAnnotator bingAnnotator = new SmaphAnnotator(wikiSense,
new FrequencyBoldFilter((float)editDistanceSpotFilterThreshold),
new NoEntityFilter(), new DummyLinkBack(), true, true, true,
wikiSearchTopK, false, 0, false, 0, wikiApi, bingKey);
return bingAnnotator;
}
}