/**
 * This file is part of General Entity Annotator Benchmark.
 *
 * General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * General Entity Annotator Benchmark is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with General Entity Annotator Benchmark.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.aksw.gerbil.dataset.impl.erd;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.math.RandomUtils;

import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.nullValue;
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Tests for {@link ERDDataset} and {@link ERDTrec}.
 *
 * <p>The dataset is loaded once from the files below {@link #ERD_DATASET_PATH} and compared
 * against a hand-written set of expected markings as well as against the raw dataset files.
 */
@Deprecated
public class ERDDatasetTest {

    /** Prefix that is stripped from marking URIs before they are compared. */
    private static final String FREEBASE_URI = "https://www.googleapis.com/freebase";
    private static final String ERD_DATASET_PATH = "gerbil_data/datasets/erd2014/";
    private static final String TEXT_FILE = "Trec_beta.query.txt";
    private static final String ANNOTATION_FILE = "Trec_beta.annotation.txt";

    private static List<Document> EXPECTED_DOCUMENTS;
    private static List<Document> LOADED_DOCUMENTS;
    private static List<String> DOCUMENT_URI;

    /**
     * Builds the expected documents and loads the dataset from disk once for all tests.
     *
     * @throws GerbilException if initialising the dataset fails
     */
    @BeforeClass
    public static void prepareResourcesToTest() throws GerbilException {
        DOCUMENT_URI = new ArrayList<String>();
        DOCUMENT_URI.add("http://ERD-Test/Trec_beta.query.txt");

        loadExpectedSet();
        loadDatasets();
        // generateTerminalOutputForLoadedErdDatasets();
    }

    /**
     * Chains a random number of {@link ERDTrec} instances built from random lines and checks
     * the column count, line number, line content and text position reported by each of them.
     */
    @Test
    public void checkTrecData() {
        int min = 5;
        int max = 10;

        List<ERDTrec> treclist = new ArrayList<>();
        List<Integer> linelist = new ArrayList<>();
        List<String> textlist = new ArrayList<>();
        List<String> second_phrase_text = new ArrayList<>();
        assertThat(treclist.size(), is(0));

        int lineColumnCount = 0;
        ERDTrec dtrec = null;
        int randomtrecs = (int) (Math.random() * max) + min;
        for (int i = 0; i < randomtrecs; i++) {
            // Line layout: "TREC-<i>\t<word> <word> ..." with at least two random words.
            int randomTextpart = (int) (Math.random() * 5) + 2;
            StringBuilder lineBuilder = new StringBuilder("TREC-" + i).append('\t');
            for (int j = 0; j < randomTextpart; j++) {
                String randomText = RandomStringUtils.randomAlphanumeric(RandomUtils.nextInt(max) + min);
                if (j > 0) {
                    lineBuilder.append(' ');
                }
                lineBuilder.append(randomText);
                if (j == 1) {
                    // Remember the second word; its position is queried further below.
                    second_phrase_text.add(randomText);
                }
            }
            String text = lineBuilder.toString();
            textlist.add(text);

            // Every trec is created with its predecessor as second constructor argument.
            dtrec = new ERDTrec(text, dtrec);
            treclist.add(dtrec);

            // Expected column count: summed line lengths plus one separator per line, minus one.
            lineColumnCount = lineColumnCount + text.length() + 1;
            linelist.add(lineColumnCount - 1);
        }

        for (ERDTrec trec : treclist) {
            assertThat(trec, is(notNullValue()));
        }

        for (int i = 0; i < treclist.size(); i++) {
            ERDTrec trec = treclist.get(i);
            assertThat(trec.getColumnCount(), is(linelist.get(i)));
            assertThat(trec.getLineNumber(), is(i));
            assertThat(trec.getLine().equals(textlist.get(i)), is(true));
            assertThat((trec.getTextPosition(second_phrase_text.get(i)) > 0), is(true));
        }
    }

    /**
     * Checks that exactly one document with 59 markings in total has been loaded.
     */
    @Test
    public void checkLoadDatasets() throws GerbilException {
        assertThat(LOADED_DOCUMENTS.size(), is(1));

        int countmarkings = 0;
        for (Document tmp : LOADED_DOCUMENTS) {
            countmarkings += tmp.getMarkings().size();
        }
        assertThat(countmarkings, is(59));
    }

    /**
     * Checks that the hand-written expected set consists of one document with 16 markings.
     */
    @Test
    public void checkExpectedDataset() {
        assertThat(EXPECTED_DOCUMENTS.size(), is(1));

        int countmarkings = 0;
        for (Document tmp : EXPECTED_DOCUMENTS) {
            countmarkings += tmp.getMarkings().size();
        }
        assertThat(countmarkings, is(16));
    }

    /**
     * Compares every expected marking with the loaded marking at the same document and marking
     * index; start, length and URI (with underscores replaced by spaces) have to be equal.
     */
    @Test
    public void checkExpectedDatasetIsSubsetOfLoadedDataset() throws GerbilException {
        for (int i = 0; i < EXPECTED_DOCUMENTS.size(); i++) {
            for (int j = 0; j < EXPECTED_DOCUMENTS.get(i).getMarkings().size(); j++) {
                String[] loaded = parseMarking(LOADED_DOCUMENTS.get(i).getMarkings().get(j).toString());
                String[] expected = parseMarking(EXPECTED_DOCUMENTS.get(i).getMarkings().get(j).toString());

                String ld_uri = loaded[2].replaceAll("_", " ");
                String ex_uri = expected[2].replaceAll("_", " ");

                // JUnit argument order is (expected, actual).
                assertEquals(expected[0], loaded[0]);
                assertEquals(expected[1], loaded[1]);
                assertEquals(ex_uri, ld_uri);
            }
        }
    }

    /**
     * For every loaded marking, reads the marked bytes from the text file and checks that the
     * same string follows at least one occurrence of the marking's URI part in the annotation
     * file.
     */
    @Test
    public void checkLoadedDatasetFindInDatasetFiles() throws GerbilException {
        String text = getString(ERD_DATASET_PATH + ANNOTATION_FILE);

        for (int i = 0; i < LOADED_DOCUMENTS.size(); i++) {
            for (int j = 0; j < LOADED_DOCUMENTS.get(i).getMarkings().size(); j++) {
                String[] parsed = parseMarking(LOADED_DOCUMENTS.get(i).getMarkings().get(j).toString());
                int start = Integer.parseInt(parsed[0]);
                int length = Integer.parseInt(parsed[1]);
                String uri = parsed[2];

                // Collect the text that follows each occurrence of the URI part (one separator
                // character is skipped after the URI).
                List<String> searchString = new ArrayList<>();
                int pos = -1;
                while ((pos = text.indexOf(uri, pos + 1)) != -1) {
                    int point = pos + uri.length() + 1;
                    // An occurrence too close to the end of the file cannot hold the full
                    // mention; skip it instead of failing with an index error.
                    if (point + length <= text.length()) {
                        searchString.add(text.substring(point, point + length));
                    }
                }

                String match = returnStringPositionInFile(ERD_DATASET_PATH + TEXT_FILE, start, length);
                assertThat((searchString.contains(match)), is(true));
            }
        }
    }

    /**
     * Splits the string form of a marking into its start, length and URI part.
     *
     * <p>The first and last character of the string are dropped, the rest is split at single
     * spaces into exactly three parts. The last character of the first two parts is removed.
     * From the third part the first character, a prefix as long as {@link #FREEBASE_URI} and
     * the last character are removed. (Presumably the input looks like
     * {@code (start, length, [uri])} - this is not verified here beyond the part count.)
     *
     * @param markingText the result of calling {@code toString()} on a marking
     * @return an array holding start, length and the shortened URI, in this order
     */
    private static String[] parseMarking(String markingText) {
        String mark = markingText.substring(1, markingText.length() - 1);
        String[] parts = mark.split(" ");
        assertThat(parts.length, is(3));

        String start = parts[0].substring(0, parts[0].length() - 1);
        String length = parts[1].substring(0, parts[1].length() - 1);
        String uri = parts[2].substring(1 + FREEBASE_URI.length(), parts[2].length() - 1);
        return new String[] { start, length, uri };
    }

    /**
     * Creates an {@link ERDDataset} for the text and annotation file, initialises it and stores
     * its instances in {@code LOADED_DOCUMENTS}. Must only be called once.
     *
     * @throws GerbilException if initialising the dataset fails
     */
    @SuppressWarnings("resource")
    private static void loadDatasets() throws GerbilException {
        assertThat(LOADED_DOCUMENTS, is(nullValue()));
        LOADED_DOCUMENTS = new ArrayList<>();
        assertThat(LOADED_DOCUMENTS, is(notNullValue()));
        assertThat(LOADED_DOCUMENTS.size(), is(0));

        ERDDataset dataset = new ERDDataset(ERD_DATASET_PATH + TEXT_FILE, ERD_DATASET_PATH + ANNOTATION_FILE);
        dataset.setName("Erd-Test");
        dataset.init();
        LOADED_DOCUMENTS.addAll(dataset.getInstances());
    }

    /**
     * Fills {@code EXPECTED_DOCUMENTS} with a single hand-written document and its 16 markings.
     * Must only be called once and after {@code DOCUMENT_URI} has been filled.
     */
    private static void loadExpectedSet() {
        assertThat(EXPECTED_DOCUMENTS, is(nullValue()));
        EXPECTED_DOCUMENTS = new ArrayList<>();
        assertThat(EXPECTED_DOCUMENTS, is(notNullValue()));
        assertThat(EXPECTED_DOCUMENTS.size(), is(0));

        List<String> text = new ArrayList<>();
        List<List<Marking>> markings = new ArrayList<>();

        text.add("..TREC-1.adobe indian houses..TREC-2.atypical squamous cells..TREC-3.battles in the civil war..TREC-4.becoming a paralegal..TREC-5.best long term care insurance..TREC-6.blue throated hummingbird..TREC-7.bowflex power pro..TREC-8.brooks brothers clearance..TREC-9.butter and margarine..TREC-10.california franchise tax board..TREC-11.cass county missouri..TREC-12.civil right movement..TREC-13.condos in florida..TREC-14.culpeper national cemetery..TREC-15.dangers of asbestos..TREC-16.designer dog breeds..TREC-17.discovery channel store..TREC-18.dog clean up bags..TREC-19.dogs for adoption..TREC-20.dutchess county tourism..TREC-21.earn money at home..TREC-22.east ridge high school..TREC-23.electronic skeet shoot..TREC-24.equal opportunity employer..TREC-25.er tv show..TREC-26.fact on uranus..TREC-27.fickle creek farm..TREC-28.french lick resort and casino..TREC-29.furniture for small spaces..TREC-30.gmat prep classes..TREC-31.gs pay rate..TREC-32.how to build a fence..TREC-33.hp mini 2140..TREC-34.illinois state tax..TREC-35.income tax return online..TREC-36.indiana child support..");
        markings.add(Arrays.asList(
                (Marking) new NamedEntity(203, 7, "https://www.googleapis.com/freebase/m/04cnvy"),
                (Marking) new NamedEntity(229, 15, "https://www.googleapis.com/freebase/m/03d452"),
                (Marking) new NamedEntity(333, 20, "https://www.googleapis.com/freebase/m/0nfgq"),
                (Marking) new NamedEntity(393, 5, "https://www.googleapis.com/freebase/m/020ys5"),
                (Marking) new NamedEntity(403, 7, "https://www.googleapis.com/freebase/m/02xry"),
                (Marking) new NamedEntity(420, 26, "https://www.googleapis.com/freebase/m/0c4tkd"),
                (Marking) new NamedEntity(601, 15, "https://www.googleapis.com/freebase/m/0dc3_"),
                (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/03ck4lv"),
                (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/027311j"),
                (Marking) new NamedEntity(662, 22, "https://www.googleapis.com/freebase/m/0bs8gsb"),
                (Marking) new NamedEntity(762, 2, "https://www.googleapis.com/freebase/m/0180mw"),
                (Marking) new NamedEntity(833, 29, "https://www.googleapis.com/freebase/m/02761b3"),
                (Marking) new NamedEntity(872, 9, "https://www.googleapis.com/freebase/m/0c_jw"),
                (Marking) new NamedEntity(913, 4, "https://www.googleapis.com/freebase/m/065y10k"),
                (Marking) new NamedEntity(1008, 14, "https://www.googleapis.com/freebase/m/03v0t"),
                (Marking) new NamedEntity(1070, 7, "https://www.googleapis.com/freebase/m/03v1s")
        ));

        // One document per text entry; URI and markings are looked up at the same index.
        for (int i = 0; i < text.size(); i++) {
            EXPECTED_DOCUMENTS.add(new DocumentImpl(text.get(i), DOCUMENT_URI.get(i), markings.get(i)));
        }
    }

    /**
     * Reads the complete file into a string using the platform default charset.
     *
     * @param filePath path of the file to read
     * @return the content of the file
     * @throws GerbilException if the file cannot be opened or read
     */
    private String getString(String filePath) throws GerbilException {
        File file = new File(filePath);
        byte[] filedata = new byte[(int) file.length()];
        // try-with-resources closes the file even if reading fails.
        try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
            raf.readFully(filedata);
            return new String(filedata);
        } catch (IOException e) {
            throw new GerbilException("Exception while reading annotation file of dataset.", e,
                    ErrorTypes.ANNOTATOR_LOADING_ERROR);
        }
    }

    /**
     * Reads {@code length} bytes starting at byte offset {@code position} of the given file and
     * decodes them using the platform default charset.
     *
     * @param filePath path of the file to read from
     * @param position byte offset at which reading starts
     * @param length   number of bytes to read
     * @return the decoded bytes
     * @throws GerbilException if the file cannot be opened or the requested range cannot be read
     */
    private String returnStringPositionInFile(String filePath, int position, int length) throws GerbilException {
        File file = new File(filePath);
        byte[] search = new byte[length];
        // try-with-resources closes the file even if seeking or reading fails.
        try (RandomAccessFile raf = new RandomAccessFile(file, "r")) {
            raf.seek(position);
            raf.readFully(search);
            return new String(search);
        } catch (IOException e) {
            throw new GerbilException("Exception while reading text file of dataset.", e,
                    ErrorTypes.ANNOTATOR_LOADING_ERROR);
        }
    }

//    private static void generateTerminalOutputForLoadedErdDatasets() throws GerbilException {
//
//        System.out.println("=========================================================");
//        System.out.println("===================== Documents [" + LOADED_DOCUMENTS.size() + "] =====================");
//        for (int i = 0; i < LOADED_DOCUMENTS.size(); i++){
//            Document doc = LOADED_DOCUMENTS.get(i);
//            System.out.println("=========================================================");
//            System.out.println("Document-URI: " + doc.getDocumentURI());
//            System.out.println("==================== Markings [" + doc.getMarkings().size() + "] ====================");
//            for (Marking mark : doc.getMarkings()){
//                System.out.println(mark.toString());
//            }
//        }
//        System.out.println("=========================================================");
//
//    }

}