package edu.stanford.nlp.semparse.open.dataset.library; import edu.stanford.nlp.semparse.open.dataset.Dataset; class UnaryDatasets { public Dataset getDataset(final String name) { if (name == null) throw new RuntimeException("No dataset specified."); if (name.equals("all")) { return new Dataset() .addTestFromDataset(getDataset("geo")) .addFromDataset(getDataset("academia")) .addFromDataset(getDataset("website")) .addFromDataset(getDataset("stanford")) .addFromDataset(getDataset("route")) .addFromDataset(getDataset("celeb")) .addFromDataset(getDataset("sport")) .addFromDataset(getDataset("leader")) .addFromDataset(getDataset("fiction")); } return new Dataset() { { switch (name) { case "one": E("European countries", L("Greece", "Germany", "Spain", "France", "Estonia", "Romania")); break; case "geo": // Easy examples: every page has roughly, should be easy to generalize E("European countries", L("Greece", "Germany", "Spain", "France", "Estonia", "Romania")); E("Asian countries", L("Japan", "China", "India", "Singapore", "Kyrgyzstan", "Iran")); E("Canada provinces", L("Quebec", "British Columbia", "Ontario", "Saskatchewan")); E("cities in California", L("Los Angeles", "San Jose", "Ontario", "Sacramento", "San Francisco")); E("Hawaii islands", L("Hawaii", "Maui", "Kauai", "Molokai", "Oahu", "Lanai", "Niihua")); E("states of the USA", L("California", "Ohio", "Alaska", "Michigan", "Kansas", "New Jersey", "Arizona")); break; case "academia": E("stanford cs faculty", LN("Percy Liang", "Andrew Ng", "Alex Aiken", "Don Knuth", "Chris Manning")); E("cmu cs faculty", LN("Avrim Blum", "Umut Acar", "Priya Narasimhan", "Mahadev Satyanarayanan")); E("Michael I Jordan students", LN("Percy Liang", "Tommi Jaakkola", "John Duchi")); E("Lillian Lee students", LN("Regina Barzilay", "Chenhao Tan", "Bo Pang", "Rie Johnson")); E("MIT CSAIL professors", LN("Daniel Jackson", "Eric Grimson", "Hal Abelson", "Shafi Goldwasser")); break; case "website": E("online social networks", L("Facebook", "Twitter", "Myspace", "Google+")); E("search engines", L("Google", "Yahoo", "Bing")); E("Chinese web portals", L("Baidu", "Sina", "Sohu")); E("social bookmarking sites", L("Reddit", "StumbleUpon", "Digg", "Delicious")); break; case "stanford": // Web pages are a little harder to parse, // but may be closer to what general people want to know E("Stanford undergraduate residence halls", LN("Branner Hall", "Lagunita Court", "Wilbur Hall")); E("Stanford departments", L("Anesthesia", "Dermatology", "Linguistics", "Geophysics")); E("stores in Stanford Shopping Center", L("Brookstone", "Gap", "Microsoft", "Urban Outfitters")); E("Stanford Marguerite lines", L("Line X", "Line O", "SLAC", "Shopping Express", "Bohannon")); E("dining halls in Stanford", L("Ricker", "Wilbur", "Branner", "Lakeside")); E("libraries in Stanford", L("Green", "Meyer", "Hoover", "East Asia", "Music")); break; case "route": // The gas station question is tricky: what should the answer format be? E("Caltrain stops", L("San Francisco", "Palo Alto", "Mountain View", "Santa Clara", "Millbrae")); E("Boston red line stations", L("Harvard Square", "Kendall", "Broadway", "South Station", "Braintree")); E("Tokyo Metro subway lines", L("Ginza", "Chiyoda", "Hibiya", "Namboku", "Fukutoshin", "Marunouchi")); break; case "celeb": E("Justin Bieber's albums", L("Believe", "Under the Mistletoe", "Never Say Never", "My World 2.0")); E("Shyamalan's movies", L("The Sixth Sense", "Unbrekable", "After Earth", "Signs")); E("Rebecca Black singles", L("Friday", "My Moment", "Person of Interest", "Sing It", "In Your Words")); E("members of The Beetles", LN("John Lennon", "Paul McCartney", "George Harrison", "Ringo Starr")); E("casts of The Room", LN("Tommy Wiseau", "Greg Sestero", "Juliette Danielle", "Philip Haldiman")); break; case "sport": E("world cup champions", L(true, "Brazil", "Spain", "Argentina", "Uruguay", "Italy", "France", "England", "Germany")); E("England football clubs", L("Manchester United", "Liverpool", "Chelsea", "Arsenal", "Manchester City")); E("football teams in California", L("Raiders", "Chargers", "49ers")); E("countries in olympics 2012", L("China", "United States", "Australia", "Azerbaijan", "North Korea")); E("Wimbledon winners in men single", LN("Andy Murray", "Roger Federer", "Rafael Nadal", "Lleyton Hewitt")); break; case "leader": E("world billionaires", LN("Bill Gates", "Warren Buffett", "Larry Page", "Larry Ellison", "Steve Ballmer")); E("united states presidents", LN("George Washington", "Thomas Jefferson", "Abraham Lincoln", "Richard Nixon", "Barack Obama", "Andrew Jackson", "Bill Clinton")); E("united states vice presidents", LN("Joe Biden", "Al Gore", "Nelson Rockefeller", "Dick Cheney", "Aaron Burr")); E("leaders of ussr", LN("Vladimir Lenin", "Joseph Stalin", "Nikita Khrushchev", "Leonid Brezhnev", "Mikhail Gorbachev")); E("provosts of Stanford University", LN("Douglas M. Whitaker", "Gerald J. Lieberman", "Donald Kennedy", "John Etchemendy", "Richard Wall Lyman", "Condoleezza Rice")); break; case "fiction": E("Hogwarts Houses", L(true, "Gryffindor", "Hufflepuff", "Ravenclaw", "Slytherin")); E("main characters of Friends", LN(true, "Rachel Green", "Monica Geller", "Phoebe Buffay", "Joey Tribbiani", "Chandler Bing", "Ross Geller")); E("Twilight Saga books", L(true, "Twilight", "New Moon", "Eclipse", "Breaking Dawn")); E("disney movies", L("Brave", "Alice In Wonderland", "Wall E", "The Jungle Book", "Pinocchio")); break; default: throw new RuntimeException("Unsupported dataset: " + name); } } }; } }