package com.constellio.app.modules.es.connectors.http;
import static com.constellio.app.modules.es.sdk.TestConnectorEvent.ADD_EVENT;
import static com.constellio.app.modules.es.sdk.TestConnectorEvent.MODIFY_EVENT;
import static com.constellio.model.entities.schemas.MetadataValueType.STRING;
import static com.constellio.model.entities.schemas.Schemas.IDENTIFIER;
import static com.constellio.model.services.search.query.logical.LogicalSearchQueryOperators.where;
import static java.util.Arrays.asList;
import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.data.MapEntry.entry;
import static org.assertj.core.groups.Tuple.tuple;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.eclipse.jetty.server.Server;
import org.joda.time.DateTimeConstants;
import org.joda.time.LocalDateTime;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import com.constellio.app.modules.es.connectors.http.utils.NtlmAuthenticationFilter;
import com.constellio.app.modules.es.connectors.http.utils.WebsitesUtils;
import com.constellio.app.modules.es.connectors.spi.ConnectorLogger;
import com.constellio.app.modules.es.connectors.spi.ConsoleConnectorLogger;
import com.constellio.app.modules.es.model.connectors.AuthenticationScheme;
import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpDocument;
import com.constellio.app.modules.es.model.connectors.http.ConnectorHttpInstance;
import com.constellio.app.modules.es.model.connectors.structures.TraversalSchedule;
import com.constellio.app.modules.es.sdk.TestConnectorEventObserver;
import com.constellio.app.modules.es.services.ConnectorManager;
import com.constellio.app.modules.es.services.ESSchemasRecordsServices;
import com.constellio.app.modules.es.services.crawler.ConnectorCrawler;
import com.constellio.app.modules.es.services.crawler.DefaultConnectorEventObserver;
import com.constellio.app.modules.es.services.mapping.ConnectorField;
import com.constellio.app.modules.es.services.mapping.ConnectorMappingService;
import com.constellio.app.modules.es.services.mapping.TargetParams;
import com.constellio.data.utils.TimeProvider;
import com.constellio.model.entities.schemas.Metadata;
import com.constellio.model.services.records.RecordServices;
import com.constellio.sdk.tests.ConstellioTest;
import com.constellio.sdk.tests.annotations.InternetTest;
@InternetTest
public class ConnectorHttpAcceptanceTest extends ConstellioTest {
// Jetty server handle. NOTE(review): presumably the local test website served at WEBSITE,
// started/stopped by helpers that are not visible in this part of the file — confirm.
Server server;
// MIME type literals (not referenced in the visible part of this class).
String htmlMimetype = "text/html";
String txtMimetype = "text/plain";
String pdfMimetype = "application/pdf";
// Reference instants for the test clock: TIME1 is captured when the test instance is created,
// the others are fixed offsets from it. Declaration order matters because of that dependency.
LocalDateTime TIME1 = new LocalDateTime();
LocalDateTime ONE_MINUTE_AFTER_TIME1 = TIME1.plusMinutes(1);
LocalDateTime TWO_MINUTES_AFTER_TIME1 = TIME1.plusMinutes(2);
LocalDateTime TWO_WEEKS_AFTER_TIME1 = TIME1.plusDays(14);
LocalDateTime FOUR_WEEKS_AFTER_TIME1 = TIME1.plusDays(28);
// Base URL of the local test website; page URLs in the assertions are built as WEBSITE + path.
private static String WEBSITE = "http://localhost:4242/";
// Services assigned in setUp().
ConnectorManager connectorManager;
RecordServices recordServices;
ESSchemasRecordsServices es;
// Connector under test. NOTE(review): most tests rely on givenDataSet1Connector() (not visible
// here) to assign it — confirm.
ConnectorHttpInstance connectorInstance;
// Logger handed to the DefaultConnectorEventObserver built in setUp().
ConnectorLogger logger = new ConsoleConnectorLogger();
private String zeMimetypeCode = "zeMimetype";
// Documents returned by the most recent tickAndGetAllDocuments() call.
private List<ConnectorHttpDocument> connectorDocuments;
// Observer installed on the crawler in setUp(); tests read newEvents() from it after each tick.
private TestConnectorEventObserver eventObserver;
/**
 * Prepares a collection with the ES module and all test users, wires the services used by the
 * tests, installs a sequential crawler (without sleeps) that reports to {@link #eventObserver},
 * and sets the test clock to {@link #TIME1}.
 */
@Before
public void setUp()
        throws Exception {
    prepareSystem(withZeCollection().withConstellioESModule().withAllTestUsers());

    es = new ESSchemasRecordsServices(zeCollection, getAppLayerFactory());
    recordServices = getModelLayerFactory().newRecordServices();
    connectorManager = es.getConnectorManager();

    // The test observer wraps the default one so the tests can inspect reported events.
    DefaultConnectorEventObserver wrappedObserver = new DefaultConnectorEventObserver(es, logger, "crawlerObserver");
    eventObserver = new TestConnectorEventObserver(es, wrappedObserver);

    connectorManager.setCrawler(
            ConnectorCrawler.runningJobsSequentially(es, eventObserver)
                    .withoutSleeps());

    givenTimeIs(TIME1);
}
/**
 * Creates a connector seeded with an external URL, ticks once, then switches the seed and the
 * include pattern to the local test website and checks that the next tick picks up the new seed
 * (fetched, level 0) and its three linked pages (not yet fetched, level 1) while keeping the
 * first document.
 */
@Test
public void whenModifyingSeedsAndInclusionsDuringExecutionThenApplied()
        throws Exception {
    final String[] columns = {"URL", "fetched", "searchable", "level"};
    final String externalSeed = "http://www.perdu.com";

    givenTestWebsiteInState1();
    connectorInstance = connectorManager.createConnector(
            es.newConnectorHttpInstance()
                    .setCode("zeConnector")
                    .setTitle("Ze connector")
                    .setEnabled(true)
                    .setSeeds(externalSeed));

    // First tick: only the external seed is expected.
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments).extracting(columns).containsOnly(
            tuple(externalSeed, true, true, 0)
    );

    // Change seed and include pattern while the connector is running, then tick again.
    recordServices.update(connectorInstance.setSeeds(WEBSITE + "index.html").setIncludePatterns(WEBSITE));
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments).extracting(columns).containsOnly(
            tuple(externalSeed, true, true, 0),
            tuple(WEBSITE + "index.html", true, true, 0),
            tuple(WEBSITE + "singes.html", false, false, 1),
            tuple(WEBSITE + "girafe.html", false, false, 1),
            tuple(WEBSITE + "elephant.html", false, false, 1)
    );
}
/**
 * Crawls the test website (state 1) with the data-set-1 connector and, after each tick, checks
 * the stored connector documents and the events reported by the observer.
 * <p>
 * NOTE(review): the body is statement-for-statement the same as
 * {@link #whenIndexingAWebsiteThenSaveFetchedAndUnfetchedDocuments()} and no PDF URL appears in
 * any assertion, despite the method name — confirm whether a PDF-specific scenario was intended.
 */
@Test
public void whenIndexingAPdfThenSaveFetchedDocument()
throws Exception {
givenTestWebsiteInState1();
givenDataSet1Connector();
// *
// * ----------------- Fetch phase 1 --------------
// * Expected: index.html fetched at TIME1, its three links recorded as unfetched level-1 documents.
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "level").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", false, null, false, 1),
tuple(WEBSITE + "girafe.html", false, null, false, 1),
tuple(WEBSITE + "elephant.html", false, null, false, 1)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(ADD_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(ADD_EVENT, WEBSITE + "singes.html"),
tuple(ADD_EVENT, WEBSITE + "girafe.html"),
tuple(ADD_EVENT, WEBSITE + "elephant.html")
);
// *
// * ----------------- Fetch phase 2 --------------
// * Expected: level-1 pages fetched one minute later, level-2 pages discovered but unfetched.
// *
givenTimeIs(ONE_MINUTE_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "level").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "licornes.html", false, null, false, 2),
tuple(WEBSITE + "singes/gorille.html", false, null, false, 2),
tuple(WEBSITE + "singes/macaque.html", false, null, false, 2)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "girafe.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(ADD_EVENT, WEBSITE + "licornes.html"),
tuple(ADD_EVENT, WEBSITE + "singes/gorille.html"),
tuple(ADD_EVENT, WEBSITE + "singes/macaque.html")
);
// *
// * ---------------- Fetch phase 3 ---------------
// * Expected: level-2 pages fetched; licornes.html ends up not searchable with one error (404).
// *
givenTimeIs(TWO_MINUTES_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "licornes.html", true, TWO_MINUTES_AFTER_TIME1, false, 1),
tuple(WEBSITE + "singes/gorille.html", true, TWO_MINUTES_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes/macaque.html", true, TWO_MINUTES_AFTER_TIME1, true, 0)
);
ConnectorHttpDocument licornes = es.getConnectorHttpDocumentByUrl(WEBSITE + "licornes.html");
assertThat(licornes.getErrorCode()).isEqualTo("404");
assertThat(licornes.getErrorMessage()).isEqualTo("Not Found");
assertThat(licornes.getErrorStackTrace()).isNull();
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html")
);
// *
// * ---------------- Nothing to fetch ---------------
// * The clock is unchanged; a further tick must not report any event.
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(eventObserver.newEvents()).extracting("eventType", "url").isEmpty();
// *
// * ---------------- Refetching everything two weeks later ---------------
// * NOTE(review): two consecutive ticks run before the assertions; presumably one pass is not
// * enough to refetch every document — confirm.
// *
givenTimeIs(TWO_WEEKS_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "girafe.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "elephant.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "licornes.html", true, TWO_WEEKS_AFTER_TIME1, false, 2),
tuple(WEBSITE + "singes/gorille.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes/macaque.html", true, TWO_WEEKS_AFTER_TIME1, true, 0)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "girafe.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html")
);
}
/**
 * Crawls the test website (state 1) with the data-set-1 connector one tick at a time and checks,
 * after each tick, which documents are stored (fetched or only discovered) and which add/modify
 * events the observer received. Finishes by moving the clock two weeks ahead and checking that
 * every document is fetched again.
 */
@Test
public void whenIndexingAWebsiteThenSaveFetchedAndUnfetchedDocuments()
throws Exception {
givenTestWebsiteInState1();
givenDataSet1Connector();
// *
// * ----------------- Fetch phase 1 --------------
// * Expected: index.html fetched at TIME1, its three links recorded as unfetched level-1 documents.
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "level").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", false, null, false, 1),
tuple(WEBSITE + "girafe.html", false, null, false, 1),
tuple(WEBSITE + "elephant.html", false, null, false, 1)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(ADD_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(ADD_EVENT, WEBSITE + "singes.html"),
tuple(ADD_EVENT, WEBSITE + "girafe.html"),
tuple(ADD_EVENT, WEBSITE + "elephant.html")
);
// *
// * ----------------- Fetch phase 2 --------------
// * Expected: level-1 pages fetched one minute later, level-2 pages discovered but unfetched.
// *
givenTimeIs(ONE_MINUTE_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "level").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1, true, 1),
tuple(WEBSITE + "licornes.html", false, null, false, 2),
tuple(WEBSITE + "singes/gorille.html", false, null, false, 2),
tuple(WEBSITE + "singes/macaque.html", false, null, false, 2)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "girafe.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(ADD_EVENT, WEBSITE + "licornes.html"),
tuple(ADD_EVENT, WEBSITE + "singes/gorille.html"),
tuple(ADD_EVENT, WEBSITE + "singes/macaque.html")
);
// *
// * ---------------- Fetch phase 3 ---------------
// * Expected: level-2 pages fetched; licornes.html ends up not searchable with one error (404).
// *
givenTimeIs(TWO_MINUTES_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1, true, 0),
tuple(WEBSITE + "licornes.html", true, TWO_MINUTES_AFTER_TIME1, false, 1),
tuple(WEBSITE + "singes/gorille.html", true, TWO_MINUTES_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes/macaque.html", true, TWO_MINUTES_AFTER_TIME1, true, 0)
);
ConnectorHttpDocument licornes = es.getConnectorHttpDocumentByUrl(WEBSITE + "licornes.html");
assertThat(licornes.getErrorCode()).isEqualTo("404");
assertThat(licornes.getErrorMessage()).isEqualTo("Not Found");
assertThat(licornes.getErrorStackTrace()).isNull();
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html")
);
// *
// * ---------------- Nothing to fetch ---------------
// * The clock is unchanged; a further tick must not report any event.
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(eventObserver.newEvents()).extracting("eventType", "url").isEmpty();
// *
// * ---------------- Refetching everything two weeks later ---------------
// * NOTE(review): two consecutive ticks run before the assertions; presumably one pass is not
// * enough to refetch every document — confirm.
// *
givenTimeIs(TWO_WEEKS_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime", "searchable", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "girafe.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "elephant.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "licornes.html", true, TWO_WEEKS_AFTER_TIME1, false, 2),
tuple(WEBSITE + "singes/gorille.html", true, TWO_WEEKS_AFTER_TIME1, true, 0),
tuple(WEBSITE + "singes/macaque.html", true, TWO_WEEKS_AFTER_TIME1, true, 0)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "girafe.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html")
);
}
/**
 * Sets on-demand URLs (newline-separated) on the connector and checks that they are fetched in
 * the very next tick together with the regular crawl, and that the connector's on-demand list
 * is null once the tick has run.
 */
@Test
public void whenOnDemandWebsitesAreSpecifiedThenPriorizedAndRemovedFromOnDemandList()
throws Exception {
givenTestWebsiteInState1();
givenDataSet1Connector();
recordServices.update(connectorInstance.setOnDemands(
WEBSITE + "singes/gorille.html\n" +
WEBSITE + "girafe.html"));
// *
// * ----------------- Fetch phase 1 with two on-demand URLs --------------
// * Expected: the seed and both on-demand pages are fetched at TIME1; pages they link to are only added.
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes/gorille.html", true, TIME1),
tuple(WEBSITE + "girafe.html", true, TIME1),
tuple(WEBSITE + "singes.html", false, null),
tuple(WEBSITE + "licornes.html", false, null),
tuple(WEBSITE + "elephant.html", false, null),
tuple(WEBSITE + "singes/macaque.html", false, null)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(ADD_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(ADD_EVENT, WEBSITE + "singes/gorille.html"),
tuple(ADD_EVENT, WEBSITE + "girafe.html"),
tuple(ADD_EVENT, WEBSITE + "singes.html"),
tuple(ADD_EVENT, WEBSITE + "licornes.html"),
tuple(ADD_EVENT, WEBSITE + "elephant.html"),
tuple(ADD_EVENT, WEBSITE + "singes/macaque.html")
);
// Reload the instance before checking that the on-demand list is now null.
recordServices.refresh(connectorInstance);
assertThat(connectorInstance.getOnDemands()).isNull();
// *
// * ----------------- Fetch phase 2 with two on-demand URLs (one already fetched, one new) --------------
// * Expected: gorille.html is fetched again (new fetch time) while girafe.html keeps TIME1;
// * yeti.html is added and fetched in the same tick.
// *
recordServices.update(connectorInstance.setOnDemands(
WEBSITE + "singes/gorille.html\n" +
WEBSITE + "yeti.html"));
givenTimeIs(ONE_MINUTE_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "girafe.html", true, TIME1),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "yeti.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "licornes.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "singes/gorille.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "singes/macaque.html", true, ONE_MINUTE_AFTER_TIME1)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(ADD_EVENT, WEBSITE + "yeti.html")
);
recordServices.refresh(connectorInstance);
assertThat(connectorInstance.getOnDemands()).isNull();
}
/**
 * Runs one tick, disables the connector and checks that a tick then changes nothing and reports
 * no event, then re-enables it and checks that the crawl continues from where it stopped
 * (level-1 pages, then level-2 pages including the 404 on licornes.html).
 */
@Test
public void givenConnectorIsStoppedThenResumeCorrectly()
throws Exception {
givenTestWebsiteInState1();
givenDataSet1Connector();
// *
// * ----------------- Fetch phase 1 --------------
// *
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes.html", false, null),
tuple(WEBSITE + "girafe.html", false, null),
tuple(WEBSITE + "elephant.html", false, null)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(ADD_EVENT, WEBSITE + "index.html"),
tuple(MODIFY_EVENT, WEBSITE + "index.html"),
tuple(ADD_EVENT, WEBSITE + "singes.html"),
tuple(ADD_EVENT, WEBSITE + "girafe.html"),
tuple(ADD_EVENT, WEBSITE + "elephant.html")
);
// *
// * ----------------- Connector is disabled - nothing is fetched --------------
// * The document list must be identical to the one after phase 1.
// *
recordServices.update(connectorInstance.setEnabled(false));
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes.html", false, null),
tuple(WEBSITE + "girafe.html", false, null),
tuple(WEBSITE + "elephant.html", false, null)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").isEmpty();
// *
// * ----------------- Fetch phase 2 --------------
// * Connector re-enabled: the pending level-1 pages are fetched one minute after TIME1.
// *
recordServices.update(connectorInstance.setEnabled(true));
givenTimeIs(ONE_MINUTE_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "licornes.html", false, null),
tuple(WEBSITE + "singes/gorille.html", false, null),
tuple(WEBSITE + "singes/macaque.html", false, null)
);
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "singes.html"),
tuple(MODIFY_EVENT, WEBSITE + "girafe.html"),
tuple(MODIFY_EVENT, WEBSITE + "elephant.html"),
tuple(ADD_EVENT, WEBSITE + "licornes.html"),
tuple(ADD_EVENT, WEBSITE + "singes/gorille.html"),
tuple(ADD_EVENT, WEBSITE + "singes/macaque.html")
);
// *
// * ---------------- Fetch phase 3 ---------------
// *
givenTimeIs(TWO_MINUTES_AFTER_TIME1);
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
tuple(WEBSITE + "index.html", true, TIME1),
tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1),
tuple(WEBSITE + "licornes.html", true, TWO_MINUTES_AFTER_TIME1),
tuple(WEBSITE + "singes/gorille.html", true, TWO_MINUTES_AFTER_TIME1),
tuple(WEBSITE + "singes/macaque.html", true, TWO_MINUTES_AFTER_TIME1)
);
// licornes.html is marked fetched but carries the HTTP error details.
ConnectorHttpDocument licornes = es.getConnectorHttpDocumentByUrl(WEBSITE + "licornes.html");
assertThat(licornes.getErrorCode()).isEqualTo("404");
assertThat(licornes.getErrorMessage()).isEqualTo("Not Found");
assertThat(licornes.getErrorStackTrace()).isNull();
assertThat(eventObserver.newEvents()).extracting("eventType", "url").containsOnly(
tuple(MODIFY_EVENT, WEBSITE + "licornes.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/gorille.html"),
tuple(MODIFY_EVENT, WEBSITE + "singes/macaque.html")
);
}
/**
 * Fully fetches the website in state 1, switches it to state 2 and moves the clock two weeks
 * ahead. The first tick must refetch every known page and add the three new ones
 * (paresseux.html, singes-wiki.pdf, singes.txt) as unfetched; the second tick must fetch those
 * three. Finally checks that known documents still exist and that the parsed content of the
 * html, pdf and txt versions of the "singes" page contains the expected words.
 */
@Test
public void givenWebSiteIsModifiedThenUpdatedCorrectly()
        throws Exception {
    final String[] documentColumns = {"URL", "fetched", "fetchedDateTime"};
    final String[] eventColumns = {"eventType", "url"};
    final LocalDateTime refetchTime = TWO_WEEKS_AFTER_TIME1;

    final String index = WEBSITE + "index.html";
    final String singes = WEBSITE + "singes.html";
    final String singesPdf = WEBSITE + "singes-wiki.pdf";
    final String singesTxt = WEBSITE + "singes.txt";
    final String girafe = WEBSITE + "girafe.html";
    final String paresseux = WEBSITE + "paresseux.html";
    final String elephant = WEBSITE + "elephant.html";
    final String licornes = WEBSITE + "licornes.html";
    final String gorille = WEBSITE + "singes/gorille.html";
    final String macaque = WEBSITE + "singes/macaque.html";

    // Baseline: version 1 of the site is completely crawled.
    givenTestWebsiteInState1();
    givenDataSet1Connector();
    fullyFetchWebsite();
    verifyWebsiteInVersion1IsCorrectlyFetched();

    // Two weeks later the site is in state 2: first tick refetches known pages and discovers new ones.
    givenTestWebsiteInState2();
    givenTimeIs(refetchTime);
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments).extracting(documentColumns).containsOnly(
            tuple(index, true, refetchTime),
            tuple(singes, true, refetchTime),
            tuple(singesPdf, false, null),
            tuple(singesTxt, false, null),
            tuple(girafe, true, refetchTime),
            tuple(paresseux, false, null),
            tuple(elephant, true, refetchTime),
            tuple(licornes, true, refetchTime),
            tuple(gorille, true, refetchTime),
            tuple(macaque, true, refetchTime)
    );
    assertThat(eventObserver.newEvents()).extracting(eventColumns).containsOnly(
            tuple(MODIFY_EVENT, index),
            tuple(MODIFY_EVENT, singes),
            tuple(ADD_EVENT, paresseux),
            tuple(ADD_EVENT, singesPdf),
            tuple(ADD_EVENT, singesTxt),
            tuple(MODIFY_EVENT, elephant),
            tuple(MODIFY_EVENT, licornes),
            tuple(MODIFY_EVENT, girafe),
            tuple(MODIFY_EVENT, gorille),
            tuple(MODIFY_EVENT, macaque)
    );

    // Second tick at the same time: only the newly discovered documents are fetched.
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments).extracting(documentColumns).containsOnly(
            tuple(index, true, refetchTime),
            tuple(singes, true, refetchTime),
            tuple(girafe, true, refetchTime),
            tuple(singesPdf, true, refetchTime),
            tuple(singesTxt, true, refetchTime),
            tuple(paresseux, true, refetchTime),
            tuple(elephant, true, refetchTime),
            tuple(licornes, true, refetchTime),
            tuple(gorille, true, refetchTime),
            tuple(macaque, true, refetchTime)
    );
    assertThat(eventObserver.newEvents()).extracting(eventColumns).containsOnly(
            tuple(MODIFY_EVENT, paresseux),
            tuple(MODIFY_EVENT, singesPdf),
            tuple(MODIFY_EVENT, singesTxt)
    );

    // Documents are still present and the content of each format was parsed.
    assertThat(es.getConnectorHttpDocumentByUrl(licornes)).isNotNull();
    assertThat(es.getConnectorHttpDocumentByUrl(girafe)).isNotNull();
    assertThat(es.getConnectorHttpDocumentByUrl(singes).getParsedContent()).contains("sympathique");
    assertThat(es.getConnectorHttpDocumentByUrl(singesPdf).getParsedContent()).contains("Simiiformes");
    assertThat(es.getConnectorHttpDocumentByUrl(singesTxt).getParsedContent()).contains("Linux");
}
/**
 * Checks error reporting when the website becomes unreachable.
 * <p>
 * Scenario, as asserted below: two ticks against the state-2 site, then the server is stopped
 * and the documents fetched while it is down get error code "io exception", one error and are
 * not searchable. After the site is available again and fully refetched two weeks later, every
 * page is searchable without error except licornes.html (404). The server is then stopped again
 * and, four weeks after TIME1, every document carries "io exception" with one error while the
 * previously searchable documents remain searchable.
 */
@Test
public void givenWebSitePagesAreNotAccessibleThenErrorCodesButStillSearchable()
throws Exception {
givenTimeIs(TIME1);
givenTestWebsiteInState2();
givenDataSet1Connector();
// Tick 1: seed fetched, level-1 links discovered.
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "searchable", "errorCode", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, true, null, 0),
tuple(WEBSITE + "singes.html", false, false, null, 0),
tuple(WEBSITE + "paresseux.html", false, false, null, 0),
tuple(WEBSITE + "elephant.html", false, false, null, 0)
);
// Tick 2: level-1 pages fetched, level-2 documents discovered.
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "searchable", "errorCode", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, true, null, 0),
tuple(WEBSITE + "singes.html", true, true, null, 0),
tuple(WEBSITE + "singes-wiki.pdf", false, false, null, 0),
tuple(WEBSITE + "singes.txt", false, false, null, 0),
tuple(WEBSITE + "paresseux.html", true, true, null, 0),
tuple(WEBSITE + "licornes.html", false, false, null, 0),
tuple(WEBSITE + "elephant.html", true, true, null, 0),
tuple(WEBSITE + "singes/gorille.html", false, false, null, 0),
tuple(WEBSITE + "singes/macaque.html", false, false, null, 0)
);
// Server down before tick 3: the pending documents are marked fetched but in error and not searchable.
stopWebsiteServer();
connectorDocuments = tickAndGetAllDocuments();
assertThat(connectorDocuments).extracting("URL", "fetched", "searchable", "errorCode", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, true, null, 0),
tuple(WEBSITE + "singes.html", true, true, null, 0),
tuple(WEBSITE + "singes-wiki.pdf", true, false, "io exception", 1),
tuple(WEBSITE + "singes.txt", true, false, "io exception", 1),
tuple(WEBSITE + "paresseux.html", true, true, null, 0),
tuple(WEBSITE + "licornes.html", true, false, "io exception", 1),
tuple(WEBSITE + "elephant.html", true, true, null, 0),
tuple(WEBSITE + "singes/gorille.html", true, false, "io exception", 1),
tuple(WEBSITE + "singes/macaque.html", true, false, "io exception", 1)
);
// Two weeks later, full refetch. NOTE(review): givenTestWebsiteInState2() presumably restarts the
// stopped server — confirm against the helper. Errors are cleared except the 404 on licornes.html.
givenTimeIs(TWO_WEEKS_AFTER_TIME1);
givenTestWebsiteInState2();
connectorDocuments = fullyFetchWebsite();
assertThat(connectorDocuments).extracting("URL", "fetched", "searchable", "errorCode", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, true, null, 0),
tuple(WEBSITE + "singes.html", true, true, null, 0),
tuple(WEBSITE + "singes-wiki.pdf", true, true, null, 0),
tuple(WEBSITE + "singes.txt", true, true, null, 0),
tuple(WEBSITE + "paresseux.html", true, true, null, 0),
tuple(WEBSITE + "licornes.html", true, false, "404", 1),
tuple(WEBSITE + "elephant.html", true, true, null, 0),
tuple(WEBSITE + "singes/gorille.html", true, true, null, 0),
tuple(WEBSITE + "singes/macaque.html", true, true, null, 0)
);
// Server down again, four weeks after TIME1: every refetch fails, but documents that were
// searchable keep that flag (licornes.html stays not searchable).
stopWebsiteServer();
givenTimeIs(FOUR_WEEKS_AFTER_TIME1);
connectorDocuments = fullyFetchWebsite();
assertThat(connectorDocuments).extracting("URL", "fetched", "searchable", "errorCode", "errorsCount").containsOnly(
tuple(WEBSITE + "index.html", true, true, "io exception", 1),
tuple(WEBSITE + "singes.html", true, true, "io exception", 1),
tuple(WEBSITE + "singes-wiki.pdf", true, true, "io exception", 1),
tuple(WEBSITE + "singes.txt", true, true, "io exception", 1),
tuple(WEBSITE + "paresseux.html", true, true, "io exception", 1),
tuple(WEBSITE + "licornes.html", true, false, "io exception", 1),
tuple(WEBSITE + "elephant.html", true, true, "io exception", 1),
tuple(WEBSITE + "singes/gorille.html", true, true, "io exception", 1),
tuple(WEBSITE + "singes/macaque.html", true, true, "io exception", 1)
);
}
/**
 * Asserts that {@link #connectorDocuments} holds exactly the seven pages of the version-1 site,
 * all flagged as fetched, and that a document exists for licornes.html. Fetch times are not
 * checked by the active assertions.
 */
private void verifyWebsiteInVersion1IsCorrectlyFetched() {
    final String[] columns = {"URL", "fetched"};
    final String licornesUrl = WEBSITE + "licornes.html";

    assertThat(connectorDocuments).extracting(columns).containsOnly(
            tuple(WEBSITE + "index.html", true),
            tuple(WEBSITE + "singes.html", true),
            tuple(WEBSITE + "girafe.html", true),
            tuple(WEBSITE + "elephant.html", true),
            tuple(licornesUrl, true),
            tuple(WEBSITE + "singes/gorille.html", true),
            tuple(WEBSITE + "singes/macaque.html", true)
    );

    // Disabled stricter variant that also pinned the fetch time of every page:
    //		assertThat(connectorDocuments).extracting("URL", "fetched", "fetchedDateTime").containsOnly(
    //				tuple(WEBSITE + "index.html", true, TIME1),
    //				tuple(WEBSITE + "singes.html", true, ONE_MINUTE_AFTER_TIME1),
    //				tuple(WEBSITE + "girafe.html", true, ONE_MINUTE_AFTER_TIME1),
    //				tuple(WEBSITE + "elephant.html", true, ONE_MINUTE_AFTER_TIME1),
    //				tuple(WEBSITE + "licornes.html", true, TWO_MINUTES_AFTER_TIME1),
    //				tuple(WEBSITE + "singes/gorille.html", true, TWO_MINUTES_AFTER_TIME1),
    //				tuple(WEBSITE + "singes/macaque.html", true, TWO_MINUTES_AFTER_TIME1)
    //		);

    assertThat(es.getConnectorHttpDocumentByUrl(licornesUrl)).isNotNull();
}
/**
 * Asserts that {@link #connectorDocuments} contains only index.html, fetched at {@link #TIME1},
 * and that no document is found for licornes.html.
 */
private void verifyWebsiteInVersion1IsNotCorrectlyFetched() {
    final String[] columns = {"URL", "fetched", "fetchedDateTime"};

    assertThat(connectorDocuments).extracting(columns).containsOnly(
            tuple(WEBSITE + "index.html", true, TIME1)
    );

    ConnectorHttpDocument licornes = es.getConnectorHttpDocumentByUrl(WEBSITE + "licornes.html");
    assertThat(licornes).isNull();
}
/**
 * Asserts that {@link #connectorDocuments} contains exactly index.html, girafe.png and
 * girafe_corrupt.png flagged as fetched, plus empty.html flagged as not fetched.
 */
private void verifyWebsiteInVersion5IsCorrectlyFetched() {
    final String[] columns = {"URL", "fetched"};

    assertThat(connectorDocuments).extracting(columns).containsOnly(
            tuple(WEBSITE + "index.html", true),
            tuple(WEBSITE + "girafe.png", true),
            tuple(WEBSITE + "girafe_corrupt.png", true),
            tuple(WEBSITE + "empty.html", false)
    );
}
/**
 * Ticks the crawler repeatedly until a tick reports no new observer event. After every tick,
 * including the last one, the test clock is moved one minute forward.
 *
 * @return the documents returned by the last tick (also stored in {@link #connectorDocuments})
 */
private List<ConnectorHttpDocument> fullyFetchWebsite() {
    boolean tickProducedEvents;
    do {
        connectorDocuments = tickAndGetAllDocuments();
        tickProducedEvents = !eventObserver.newEvents().isEmpty();
        givenTimeIs(TimeProvider.getLocalDateTime().plusMinutes(1));
    } while (tickProducedEvents);
    return connectorDocuments;
}
@Test
public void whenIndexingAWebSiteWithDuplicatedPagesThenDoNot()
throws Exception {
givenTestWebsiteInState1();
givenDataSet1Connector();
// *
// * ----------------- Fully fetching website in version 1 --------------
// *
fullyFetchWebsite();
verifyWebsiteInVersion1IsCorrectlyFetched();
ConnectorHttpContext context = loadContext();
assertThat(context.fetchedUrls).containsOnly(
WEBSITE + "index.html",
WEBSITE + "singes.html",
WEBSITE + "girafe.html",
WEBSITE + "elephant.html",
WEBSITE + "licornes.html",
WEBSITE + "singes/gorille.html",
WEBSITE + "singes/macaque.html"
);
assertThat(context.documentUrlsClassifiedByDigests).containsOnly(
entry(INDEX_DIGEST, WEBSITE + "index.html"),
entry(SINGES_DIGEST, WEBSITE + "singes.html"),
entry(GIRAFE_DIGEST, WEBSITE + "girafe.html"),
entry(ELEPHANT_DIGEST, WEBSITE + "elephant.html"),
entry(SINGES_GORILLE_DIGEST, WEBSITE + "singes/gorille.html"),
entry(SINGES_MACAQUE_DIGEST, WEBSITE + "singes/macaque.html")
);
// *
// * ----------------- Fully fetching website in version 3 (with duplicates) --------------
// *
givenTimeIs(TWO_WEEKS_AFTER_TIME1);
givenTestWebsiteInState3WithDuplucates();
fullyFetchWebsite();
context = loadContext();
assertThat(context.fetchedUrls).containsOnly(
WEBSITE + "index.html",
WEBSITE + "singes.html",
WEBSITE + "singes.txt",
WEBSITE + "singes-wiki.pdf",
WEBSITE + "paresseux.html",
WEBSITE + "girafe.html",
WEBSITE + "elephant.html",
WEBSITE + "licornes.html",
WEBSITE + "singes/gorille.html",
WEBSITE + "singes/dk.html",
WEBSITE + "singes/macaque.html",
WEBSITE + "copy/index.html",
WEBSITE + "copy/singes.html",
WEBSITE + "copy/singes.txt",
WEBSITE + "copy/singes-wiki.pdf",
WEBSITE + "copy/paresseux.html",
WEBSITE + "copy/elephant.html",
WEBSITE + "copy/licornes.html",
WEBSITE + "copy/singes/gorille.html",
WEBSITE + "copy/singes/dk.html",
WEBSITE + "copy/singes/macaque.html"
);
assertThat(context.documentUrlsClassifiedByDigests).containsOnly(
entry(INDEX_DIGEST_V3, WEBSITE + "index.html"),
entry(INDEX_COPY_DIGEST_V3, WEBSITE + "copy/index.html"),
entry(SINGES_DIGEST_V3, WEBSITE + "singes.html"),
entry(PARESSEUX_DIGEST_V3, WEBSITE + "paresseux.html"),
entry(ELEPHANT_DIGEST_V3, WEBSITE + "elephant.html"),
entry(SINGES_GORILLE_AND_DK_DIGEST_V3, WEBSITE + "singes/gorille.html"),
entry(SINGES_MACAQUE_DIGEST_V3, WEBSITE + "singes/macaque.html"),
entry(SINGES_PDF_DIGEST_V3, WEBSITE + "singes-wiki.pdf"),
entry(SINGES_TEXT_DIGEST_V3, WEBSITE + "singes.txt"),
entry(GIRAFE_DIGEST, WEBSITE + "girafe.html")
);
ConnectorHttpDocument singe = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.html");
assertThat(singe.isSearchable()).isTrue();
assertThat(singe.getParsedContent()).contains("gorille");
assertThat(singe.getCopyOf()).isNull();
assertThat(singe.getDigest()).contains(SINGES_DIGEST_V3);
// assertThat(singe.getOutlinks()).containsOnly(
// WEBSITE + "elephant.html",
// WEBSITE + "paresseux.html",
// WEBSITE + "singes.txt",
// WEBSITE + "singes-wiki.pdf",
// WEBSITE + "singes/gorille.html",
// WEBSITE + "singes/macaque.html"
// );
ConnectorHttpDocument singeCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes.html");
assertThat(singeCopy.isSearchable()).isFalse();
assertThat(singeCopy.getParsedContent()).isNull();
assertThat(singeCopy.getCopyOf()).isEqualTo(singe.getURL());
assertThat(singeCopy.getDigest()).contains(SINGES_DIGEST_V3);
// assertThat(singeCopy.getOutlinks()).containsOnly(
// WEBSITE + "copy/elephant.html",
// WEBSITE + "copy/paresseux.html",
// WEBSITE + "copy/singes.txt",
// WEBSITE + "copy/singes-wiki.pdf",
// WEBSITE + "copy/singes/gorille.html",
// WEBSITE + "copy/singes/macaque.html"
// );
ConnectorHttpDocument gorille = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes/gorille.html");
assertThat(gorille.isSearchable()).isTrue();
assertThat(gorille.getParsedContent()).contains("gros");
assertThat(gorille.getCopyOf()).isNull();
assertThat(gorille.getDigest()).contains(SINGES_GORILLE_AND_DK_DIGEST_V3);
// assertThat(gorille.getOutlinks()).containsOnly(
// WEBSITE + "singes.html",
// WEBSITE + "singes/dk.html",
// WEBSITE + "singes/macaque.html"
// );
ConnectorHttpDocument dk = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes/dk.html");
assertThat(dk.isSearchable()).isFalse();
assertThat(dk.getParsedContent()).isNull();
assertThat(dk.getCopyOf()).isEqualTo(gorille.getURL());
assertThat(dk.getDigest()).contains(SINGES_GORILLE_AND_DK_DIGEST_V3);
// assertThat(dk.getOutlinks()).containsOnly(
// WEBSITE + "singes.html",
// WEBSITE + "singes/gorille.html",
// WEBSITE + "singes/macaque.html"
// );
ConnectorHttpDocument singeTxt = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.txt");
assertThat(singeTxt.isSearchable()).isTrue();
assertThat(singeTxt.getParsedContent()).contains("consultant");
assertThat(singeTxt.getCopyOf()).isNull();
assertThat(singeTxt.getDigest()).contains(SINGES_TEXT_DIGEST_V3);
assertThat(singeTxt.getOutlinks()).isEmpty();
ConnectorHttpDocument singeTxtCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes.txt");
assertThat(singeTxtCopy.isSearchable()).isFalse();
assertThat(singeTxtCopy.getParsedContent()).isNull();
assertThat(singeTxtCopy.getCopyOf()).isEqualTo(singeTxt.getURL());
assertThat(singeTxtCopy.getDigest()).contains(SINGES_TEXT_DIGEST_V3);
assertThat(singeTxtCopy.getOutlinks()).isEmpty();
ConnectorHttpDocument singePdf = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes-wiki.pdf");
assertThat(singePdf.isSearchable()).isTrue();
assertThat(singePdf.getParsedContent()).contains("Wikimedia");
assertThat(singePdf.getCopyOf()).isNull();
assertThat(singePdf.getDigest()).contains(SINGES_PDF_DIGEST_V3);
assertThat(singePdf.getOutlinks()).isEmpty();
ConnectorHttpDocument singePdfCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes-wiki.pdf");
assertThat(singePdfCopy.isSearchable()).isFalse();
assertThat(singePdfCopy.getParsedContent()).isNull();
assertThat(singePdfCopy.getCopyOf()).isEqualTo(singePdf.getURL());
assertThat(singePdfCopy.getDigest()).contains(SINGES_PDF_DIGEST_V3);
assertThat(singePdfCopy.getOutlinks()).isEmpty();
// *
// * ----------------- Fully fetching website in version 4 (some duplicated pages were modified) --------------
// *
givenTimeIs(FOUR_WEEKS_AFTER_TIME1);
givenTestWebsiteInState4WithDuplicatesModified();
fullyFetchWebsite();
context = loadContext();
assertThat(context.fetchedUrls).containsOnly(
WEBSITE + "index.html",
WEBSITE + "singes.html",
WEBSITE + "singes.txt",
WEBSITE + "singes-wiki.pdf",
WEBSITE + "paresseux.html",
WEBSITE + "girafe.html",
WEBSITE + "elephant.html",
WEBSITE + "licornes.html",
WEBSITE + "singes/gorille.html",
WEBSITE + "singes/dk.html",
WEBSITE + "singes/macaque.html",
WEBSITE + "copy/index.html",
WEBSITE + "copy/singes.html",
WEBSITE + "copy/singes.txt",
WEBSITE + "copy/singes-wiki.pdf",
WEBSITE + "copy/paresseux.html",
WEBSITE + "copy/elephant.html",
WEBSITE + "copy/licornes.html",
WEBSITE + "copy/singes/gorille.html",
WEBSITE + "copy/singes/dk.html",
WEBSITE + "copy/singes/macaque.html"
);
assertThat(context.documentUrlsClassifiedByDigests).containsOnly(
entry(INDEX_DIGEST_V3, WEBSITE + "index.html"),
entry(INDEX_COPY_DIGEST_V3, WEBSITE + "copy/index.html"),
entry(SINGES_DIGEST_V3, WEBSITE + "singes.html"),
entry(PARESSEUX_DIGEST_V3, WEBSITE + "paresseux.html"),
entry(ELEPHANT_DIGEST_V3, WEBSITE + "elephant.html"),
entry(SINGES_GORILLE_DIGEST_V4, WEBSITE + "singes/gorille.html"),
entry(SINGES_DK_DIGEST_V4, WEBSITE + "singes/dk.html"),
entry(SINGES_MACAQUE_DIGEST_V3, WEBSITE + "singes/macaque.html"),
entry(SINGES_PDF_DIGEST_V3, WEBSITE + "singes-wiki.pdf"),
entry(SINGES_TEXT_DIGEST_V4, WEBSITE + "singes.txt"),
entry(SINGES_TEXT_COPY_DIGEST_V4, WEBSITE + "copy/singes.txt"),
entry(GIRAFE_DIGEST, WEBSITE + "girafe.html")
);
singe = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.html");
assertThat(singe.isSearchable()).isTrue();
assertThat(singe.getParsedContent()).contains("gorille");
assertThat(singe.getCopyOf()).isNull();
assertThat(singe.getDigest()).contains(SINGES_DIGEST_V3);
// assertThat(singe.getOutlinks()).containsOnly(
// WEBSITE + "elephant.html",
// WEBSITE + "paresseux.html",
// WEBSITE + "singes.txt",
// WEBSITE + "singes-wiki.pdf",
// WEBSITE + "singes/gorille.html",
// WEBSITE + "singes/macaque.html"
// );
singeCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes.html");
assertThat(singeCopy.isSearchable()).isFalse();
assertThat(singeCopy.getParsedContent()).isNull();
assertThat(singeCopy.getCopyOf()).isEqualTo(singe.getURL());
assertThat(singeCopy.getDigest()).contains(SINGES_DIGEST_V3);
// assertThat(singeCopy.getOutlinks()).containsOnly(
// WEBSITE + "copy/elephant.html",
// WEBSITE + "copy/paresseux.html",
// WEBSITE + "copy/singes.txt",
// WEBSITE + "copy/singes-wiki.pdf",
// WEBSITE + "copy/singes/gorille.html",
// WEBSITE + "copy/singes/macaque.html"
// );
gorille = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes/gorille.html");
assertThat(gorille.isSearchable()).isTrue();
assertThat(gorille.getParsedContent()).contains("gros");
assertThat(gorille.getCopyOf()).isNull();
assertThat(gorille.getDigest()).contains(SINGES_GORILLE_DIGEST_V4);
// assertThat(gorille.getOutlinks()).containsOnly(
// WEBSITE + "singes.html",
// WEBSITE + "singes/dk.html",
// WEBSITE + "singes/macaque.html"
// );
dk = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes/dk.html");
assertThat(dk.isSearchable()).isTrue();
assertThat(dk.getParsedContent()).contains("gros").contains("DK");
assertThat(dk.getCopyOf()).isNull();
assertThat(dk.getDigest()).contains(SINGES_DK_DIGEST_V4);
// assertThat(dk.getOutlinks()).containsOnly(
// WEBSITE + "singes.html",
// WEBSITE + "singes/gorille.html",
// WEBSITE + "singes/macaque.html"
// );
singeTxt = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.txt");
assertThat(singeTxt.isSearchable()).isTrue();
assertThat(singeTxt.getParsedContent()).contains("consultant").contains("$");
assertThat(singeTxt.getCopyOf()).isNull();
assertThat(singeTxt.getDigest()).contains(SINGES_TEXT_DIGEST_V4);
assertThat(singeTxt.getOutlinks()).isEmpty();
singeTxtCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes.txt");
assertThat(singeTxtCopy.isSearchable()).isTrue();
assertThat(singeTxtCopy.getParsedContent()).contains("consultant");
assertThat(singeTxtCopy.getCopyOf()).isNull();
assertThat(singeTxtCopy.getDigest()).contains(SINGES_TEXT_COPY_DIGEST_V4);
assertThat(singeTxtCopy.getOutlinks()).isEmpty();
singePdf = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes-wiki.pdf");
assertThat(singePdf.isSearchable()).isTrue();
assertThat(singePdf.getParsedContent()).contains("Wikimedia");
assertThat(singePdf.getCopyOf()).isNull();
assertThat(singePdf.getDigest()).contains(SINGES_PDF_DIGEST_V3);
assertThat(singePdf.getOutlinks()).isEmpty();
singePdfCopy = es.getConnectorHttpDocumentByUrl(WEBSITE + "copy/singes-wiki.pdf");
assertThat(singePdfCopy.isSearchable()).isFalse();
assertThat(singePdfCopy.getParsedContent()).isNull();
assertThat(singePdfCopy.getCopyOf()).isEqualTo(singePdf.getURL());
assertThat(singePdfCopy.getDigest()).contains(SINGES_PDF_DIGEST_V3);
assertThat(singePdfCopy.getOutlinks()).isEmpty();
}
/**
 * Follows "licornes.html" over three full fetches of the state 1 website, performed 14 days apart.
 * <p>
 * The document reports an errors count of 1 after the first fetch and of 2 after the second one; after the
 * third fetch it can no longer be retrieved by its URL. The test name attributes these errors to a 404 response.
 */
@Test
public void givenADocumentHasErrorCode404ForAThirdTimeWhenFetchingThenDeleted()
        throws Exception {
    final String brokenPageUrl = WEBSITE + "licornes.html";

    // First traversal: one error recorded
    givenTimeIs(TIME1);
    givenDataSet1Connector();
    givenTestWebsiteInState1();
    fullyFetchWebsite();
    ConnectorHttpDocument afterFirstFetch = es.getConnectorHttpDocumentByUrl(brokenPageUrl);
    assertThat(afterFirstFetch.getErrorsCount()).isEqualTo(1);

    // Second traversal, 14 days later: two errors recorded
    givenTimeIs(TIME1.plusDays(14));
    fullyFetchWebsite();
    ConnectorHttpDocument afterSecondFetch = es.getConnectorHttpDocumentByUrl(brokenPageUrl);
    assertThat(afterSecondFetch.getErrorsCount()).isEqualTo(2);

    // Third traversal, 28 days after the first one: the document is gone
    givenTimeIs(TIME1.plusDays(28));
    fullyFetchWebsite();
    ConnectorHttpDocument afterThirdFetch = es.getConnectorHttpDocumentByUrl(brokenPageUrl);
    assertThat(afterThirdFetch).isNull();
}
/**
 * Checks that the connector's max level bounds which documents are added.
 * <p>
 * With a max level of 1, the first crawl pass fetches the index page (level 0) and registers three unfetched
 * level 1 pages; the second pass fetches those three pages; a full fetch afterwards leaves exactly these four
 * URLs in the context. Once the max level is raised to 2 and the website is fully fetched two weeks later,
 * three level 2 pages are expected in addition.
 */
@Test
public void whenIndexingAWebsiteWhenDocumentLevelExceedMaximumThenNotAdded()
        throws Exception {
    final String indexUrl = WEBSITE + "index.html";
    final String singesUrl = WEBSITE + "singes.html";
    final String girafeUrl = WEBSITE + "girafe.html";
    final String elephantUrl = WEBSITE + "elephant.html";
    final String licornesUrl = WEBSITE + "licornes.html";
    final String gorilleUrl = WEBSITE + "singes/gorille.html";
    final String macaqueUrl = WEBSITE + "singes/macaque.html";

    givenTestWebsiteInState1();
    givenDataSet1Connector();
    recordServices.update(connectorInstance.setMaxLevel(1));

    // ---- Crawl pass 1: only the seed is fetched, its direct links are registered ----
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments)
            .extracting("URL", "fetched", "fetchedDateTime", "searchable", "level")
            .containsOnly(
                    tuple(indexUrl, true, TIME1, true, 0),
                    tuple(singesUrl, false, null, false, 1),
                    tuple(girafeUrl, false, null, false, 1),
                    tuple(elephantUrl, false, null, false, 1)
            );
    assertThat(eventObserver.newEvents())
            .extracting("eventType", "url")
            .containsOnly(
                    tuple(ADD_EVENT, indexUrl),
                    tuple(MODIFY_EVENT, indexUrl),
                    tuple(ADD_EVENT, singesUrl),
                    tuple(ADD_EVENT, girafeUrl),
                    tuple(ADD_EVENT, elephantUrl)
            );

    // ---- Crawl pass 2: the level 1 pages are fetched, nothing deeper is added ----
    givenTimeIs(ONE_MINUTE_AFTER_TIME1);
    connectorDocuments = tickAndGetAllDocuments();
    assertThat(connectorDocuments)
            .extracting("URL", "fetched", "fetchedDateTime", "searchable", "level")
            .containsOnly(
                    tuple(indexUrl, true, TIME1, true, 0),
                    tuple(singesUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1),
                    tuple(girafeUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1),
                    tuple(elephantUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1)
            );
    assertThat(eventObserver.newEvents())
            .extracting("eventType", "url")
            .containsOnly(
                    tuple(MODIFY_EVENT, singesUrl),
                    tuple(MODIFY_EVENT, girafeUrl),
                    tuple(MODIFY_EVENT, elephantUrl)
            );

    // ---- Full fetch with max level 1: same four documents ----
    // NOTE(review): connectorDocuments is not reassigned in this method after the full fetch; presumably
    // fullyFetchWebsite() refreshes that field - confirm.
    fullyFetchWebsite();
    assertThat(connectorDocuments)
            .extracting("URL", "fetched", "fetchedDateTime", "searchable", "level")
            .containsOnly(
                    tuple(indexUrl, true, TIME1, true, 0),
                    tuple(singesUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1),
                    tuple(girafeUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1),
                    tuple(elephantUrl, true, ONE_MINUTE_AFTER_TIME1, true, 1)
            );
    ConnectorHttpContext crawlContext = loadContext();
    assertThat(crawlContext.fetchedUrls).containsOnly(
            indexUrl,
            singesUrl,
            elephantUrl,
            girafeUrl
    );

    // ---- Max level raised to 2, full fetch two weeks later: level 2 pages show up ----
    recordServices.update(connectorInstance.setMaxLevel(2));
    givenTimeIs(TWO_WEEKS_AFTER_TIME1);
    fullyFetchWebsite();
    assertThat(connectorDocuments)
            .extracting("URL", "fetched", "searchable", "level")
            .containsOnly(
                    tuple(indexUrl, true, true, 0),
                    tuple(singesUrl, true, true, 1),
                    tuple(girafeUrl, true, true, 1),
                    tuple(elephantUrl, true, true, 1),
                    tuple(licornesUrl, true, false, 2),
                    tuple(gorilleUrl, true, true, 2),
                    tuple(macaqueUrl, true, true, 2)
            );
    crawlContext = loadContext();
    assertThat(crawlContext.fetchedUrls).containsOnly(
            indexUrl,
            singesUrl,
            elephantUrl,
            girafeUrl,
            licornesUrl,
            gorilleUrl,
            macaqueUrl
    );
}
/**
 * A connector whose single traversal schedule is (WEDNESDAY, "11:40", "13:30") must report itself as currently
 * running when the time is set to a Wednesday at 12:50.
 */
@Test
public void givenAppropriateTimeForScheduleThenConnectorCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    // A Wednesday at 12:50, between the schedule's two bounds
    LocalDateTime wednesdayAt1250 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(12)
            .withMinuteOfHour(50);
    givenTimeIs(wednesdayAt1250);

    List<TraversalSchedule> schedules = asList(
            new TraversalSchedule(DateTimeConstants.WEDNESDAY, "11:40", "13:30"));
    connectorInstance.setTraversalSchedule(schedules);

    assertThat(connectorInstance.isCurrentlyRunning()).isTrue();
}
/**
 * A connector whose single traversal schedule is (WEDNESDAY, "00:00", "00:00") must report itself as currently
 * running when the time is set to a Wednesday at 12:50. The test name calls this a full daily schedule.
 */
@Test
public void givenFullDailyScheduleThenConnectorCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    LocalDateTime wednesdayAt1250 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(12)
            .withMinuteOfHour(50);
    givenTimeIs(wednesdayAt1250);

    // Start and end bounds are both "00:00"
    List<TraversalSchedule> schedules = asList(
            new TraversalSchedule(DateTimeConstants.WEDNESDAY, "00:00", "00:00"));
    connectorInstance.setTraversalSchedule(schedules);

    assertThat(connectorInstance.isCurrentlyRunning()).isTrue();
}
/**
 * A connector whose single traversal schedule is (WEDNESDAY, "11:40", "13:30") must not report itself as
 * currently running when the time is set to a Wednesday at 14:10, i.e. later than the schedule's end bound.
 */
@Test
public void givenTimeAfterScheduleThenConnectorNotCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    // A Wednesday at 14:10
    LocalDateTime wednesdayAt1410 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(14)
            .withMinuteOfHour(10);
    givenTimeIs(wednesdayAt1410);

    List<TraversalSchedule> schedules = asList(
            new TraversalSchedule(DateTimeConstants.WEDNESDAY, "11:40", "13:30"));
    connectorInstance.setTraversalSchedule(schedules);

    assertThat(connectorInstance.isCurrentlyRunning()).isFalse();
}
/**
 * A connector whose single traversal schedule is (WEDNESDAY, "11:40", "13:30") must not report itself as
 * currently running when the time is set to a Wednesday at 11:10, i.e. earlier than the schedule's start bound.
 */
@Test
public void givenTimeBeforeScheduleThenConnectorNotCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    // A Wednesday at 11:10
    LocalDateTime wednesdayAt1110 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(11)
            .withMinuteOfHour(10);
    givenTimeIs(wednesdayAt1110);

    List<TraversalSchedule> schedules = asList(
            new TraversalSchedule(DateTimeConstants.WEDNESDAY, "11:40", "13:30"));
    connectorInstance.setTraversalSchedule(schedules);

    assertThat(connectorInstance.isCurrentlyRunning()).isFalse();
}
/**
 * A connector configured with an empty list of traversal schedules must report itself as currently running
 * (checked here with the time set to a Wednesday at 11:10).
 */
@Test
public void givenNoScheduleThenConnectorCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    LocalDateTime wednesdayAt1110 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(11)
            .withMinuteOfHour(10);
    givenTimeIs(wednesdayAt1110);

    // Explicitly empty schedule list
    List<TraversalSchedule> noSchedules = new ArrayList<TraversalSchedule>();
    connectorInstance.setTraversalSchedule(noSchedules);

    assertThat(connectorInstance.isCurrentlyRunning()).isTrue();
}
/**
 * A connector on which no traversal schedule was ever set must report itself as currently running
 * (checked here with the time set to a Wednesday at 11:10).
 */
@Test
public void givenNullScheduleThenConnectorCurrentlyRunning()
        throws Exception {
    connectorInstance = es.newConnectorHttpInstanceWithId("zeConnector")
            .setCode("zeConnector")
            .setTitle("Ze connector")
            .setEnabled(false)
            .setSeeds("http://constellio.com");

    LocalDateTime wednesdayAt1110 = new LocalDateTime()
            .withDayOfWeek(DateTimeConstants.WEDNESDAY)
            .withHourOfDay(11)
            .withMinuteOfHour(10);
    givenTimeIs(wednesdayAt1110);

    // setTraversalSchedule(...) is deliberately not called in this test
    assertThat(connectorInstance.isCurrentlyRunning()).isTrue();
}
/**
 * Starts the NTLM variant of the state 1 website, creates the connector configured with NTLM authentication,
 * fully fetches the website and then runs the "version 1 correctly fetched" verification.
 */
@Test
public void givenWebSiteIsNtlmWhenAuthenticationThenFetchCorrectly()
throws Exception {
givenTestWebsiteInState1Ntlm();
givenDataSet1ConnectorWithNtlmAuthentication();
fullyFetchWebsite();
verifyWebsiteInVersion1IsCorrectlyFetched();
}
/**
 * Starts the NTLM variant of the state 1 website but creates the regular connector (the one built without any
 * authentication settings), fully fetches the website and then runs the "version 1 not correctly fetched"
 * verification.
 */
@Test
public void givenWebSiteIsNtlmWhenNoAuthenticationThenFetchFails()
throws Exception {
givenTestWebsiteInState1Ntlm();
givenDataSet1Connector();
fullyFetchWebsite();
verifyWebsiteInVersion1IsNotCorrectlyFetched();
}
/**
 * Starts the state 5 website, creates the regular connector, fully fetches the website and then runs the
 * "version 5 correctly fetched" verification. Per the test name, state 5 is the one serving invalid and empty
 * content.
 */
@Test
public void givenInvalidAndEmptyContentThenStopsFetching()
throws Exception {
givenTestWebsiteInState5();
givenDataSet1Connector();
fullyFetchWebsite();
verifyWebsiteInVersion5IsCorrectlyFetched();
// Disabled check, kept for reference:
//assertThat(fullyFetchWebsite()).isEmpty();
}
/**
 * Verifies the mimetype and title stored on fetched documents.
 * <p>
 * The state 1 website is fully fetched and verified first. The website is then switched to state 2, the time
 * moved two weeks forward and four crawl passes are run, after which the HTML, PDF and text documents are
 * expected to carry their respective mimetype and a title equal to their file name.
 */
@Test
public void whenFetchingThenValidMimetypes()
        throws Exception {
    givenTestWebsiteInState1();
    givenDataSet1Connector();
    fullyFetchWebsite();
    verifyWebsiteInVersion1IsCorrectlyFetched();

    // ---- Refetching everything two weeks later ----
    givenTestWebsiteInState2();
    givenTimeIs(TWO_WEEKS_AFTER_TIME1);
    for (int pass = 0; pass < 4; pass++) {
        tickAndGetAllDocuments();
    }

    ConnectorHttpDocument htmlDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "girafe.html");
    ConnectorHttpDocument pdfDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes-wiki.pdf");
    ConnectorHttpDocument textDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.txt");

    assertThat(htmlDocument.getMimetype()).isEqualTo(htmlMimetype);
    assertThat(htmlDocument.getTitle()).isEqualTo("girafe.html");

    assertThat(pdfDocument.getMimetype()).isNotNull().isEqualTo(pdfMimetype);
    assertThat(pdfDocument.getTitle()).isEqualTo("singes-wiki.pdf");

    assertThat(textDocument.getMimetype()).isNotNull().isEqualTo(txtMimetype);
    assertThat(textDocument.getTitle()).isEqualTo("singes.txt");
}
/**
 * Verifies that connector fields mapped to target metadatas are persisted on the fetched documents.
 * <p>
 * Three STRING target metadatas ("language", "encoding", "lastModification") are created on the connector HTTP
 * document schema type and mapped to the "connectorHttpDocument:language", "connectorHttpDocument:charset" and
 * "connectorHttpDocument:lastModification" fields. After a full fetch of state 1 and four crawl passes on
 * state 2 two weeks later, the HTML, PDF and text documents are expected to hold "fr" as language and
 * "ISO-8859-1" as encoding. The last modification values are not asserted.
 */
@Test
public void givenMappedPropertiesWhenFetchingThenPersisted()
        throws Exception {
    givenTestWebsiteInState1();
    givenDataSet1Connector();
    givenTimeIs(TIME1);

    String documentSchemaType = ConnectorHttpDocument.SCHEMA_TYPE;
    ConnectorMappingService mappingService = new ConnectorMappingService(es);

    // Target metadatas receiving the mapped values
    Metadata language = mappingService.createTargetMetadata(
            connectorInstance, documentSchemaType, new TargetParams("language", "Language", STRING));
    Metadata encoding = mappingService.createTargetMetadata(
            connectorInstance, documentSchemaType, new TargetParams("encoding", "Encoding", STRING));
    Metadata lastModification = mappingService.createTargetMetadata(
            connectorInstance, documentSchemaType,
            new TargetParams("lastModification", "Last modification", STRING));

    // The available connector fields are only printed, nothing is asserted on them
    List<ConnectorField> availableFields = mappingService.getConnectorFields(connectorInstance, documentSchemaType);
    System.out.println(availableFields);

    Map<String, List<String>> fieldsByTargetMetadata = new HashMap<>();
    fieldsByTargetMetadata.put(encoding.getLocalCode(), asList("connectorHttpDocument:charset"));
    fieldsByTargetMetadata.put(language.getLocalCode(), asList("connectorHttpDocument:language"));
    fieldsByTargetMetadata.put(lastModification.getLocalCode(), asList("connectorHttpDocument:lastModification"));
    mappingService.setMapping(connectorInstance, documentSchemaType, fieldsByTargetMetadata);

    fullyFetchWebsite();
    verifyWebsiteInVersion1IsCorrectlyFetched();

    // ---- Refetching everything two weeks later ----
    givenTestWebsiteInState2();
    givenTimeIs(TWO_WEEKS_AFTER_TIME1);
    for (int pass = 0; pass < 4; pass++) {
        tickAndGetAllDocuments();
    }

    ConnectorHttpDocument htmlDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.html");
    ConnectorHttpDocument pdfDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes-wiki.pdf");
    ConnectorHttpDocument textDocument = es.getConnectorHttpDocumentByUrl(WEBSITE + "singes.txt");

    // Assertions on the lastModification metadata are currently disabled for all three documents.
    assertThat(htmlDocument.getList(language)).containsOnly("fr");
    assertThat(htmlDocument.getList(encoding)).containsOnly("ISO-8859-1");

    assertThat(pdfDocument.getList(language)).containsOnly("fr");
    assertThat(pdfDocument.getList(encoding)).containsOnly("ISO-8859-1");

    assertThat(textDocument.getList(language)).containsOnly("fr");
    assertThat(textDocument.getList(encoding)).containsOnly("ISO-8859-1");
}
// ---------------------------------------------------------------
// Expected digest values. The tests compare them with ConnectorHttpDocument.getDigest() and with the
// digest -> URL entries of the loaded connector context (documentUrlsClassifiedByDigests).
// NOTE(review): the values look like Base64-encoded 20-byte hashes (presumably SHA-1) - confirm against the
// connector implementation.

// Digests without a version suffix. GIRAFE_DIGEST is also expected by the version 3 and version 4 assertions.
private static final String INDEX_DIGEST = "/Ok4oZZZGe0FeVpxNylB6tur1AY=";
private static final String SINGES_DIGEST = "Ngn1NIMLzej3eAGtQ6FwAlLQLxM=";
private static final String GIRAFE_DIGEST = "S5Mj9KzanPY19YZzxAOinp9kIvk=";
private static final String ELEPHANT_DIGEST = "Vy1aUfAbqgO2WY9zzbLKUVKJEXQ=";
private static final String SINGES_GORILLE_DIGEST = "TUnkhbiXTfUjJd8qMU9BTlhtXQA=";
private static final String SINGES_MACAQUE_DIGEST = "bkNYVzvgX3dxKuC5fOg5504ZsGY=";
// Digests expected once the website is in version 3 (with duplicates).
private static final String INDEX_DIGEST_V3 = "dOTKmIFwfAJynDrgwRQOtsIPWjo=";
private static final String INDEX_COPY_DIGEST_V3 = "6We6PJ/pwevIIzXKIELZtEYUX4Y=";
private static final String SINGES_DIGEST_V3 = "bYljEwL34ycnskhqnDYbY1GLMDg=";
private static final String SINGES_PDF_DIGEST_V3 = "nkKGz5voDPtR+h510qfkdyTY5mk=";
private static final String SINGES_TEXT_DIGEST_V3 = "qBpsYTysp2dnSNEh2cRhRJSh/3M=";
private static final String PARESSEUX_DIGEST_V3 = "NyjsogOzzRBBU/n30HSG6W6TkuE=";
private static final String ELEPHANT_DIGEST_V3 = "DX9eskZsdNQDUxHwWUsKWT92ujU=";
// Same value as SINGES_GORILLE_DIGEST: in version 3 gorille.html and dk.html are expected to share this digest.
private static final String SINGES_GORILLE_AND_DK_DIGEST_V3 = "TUnkhbiXTfUjJd8qMU9BTlhtXQA=";
// Same value as SINGES_MACAQUE_DIGEST.
private static final String SINGES_MACAQUE_DIGEST_V3 = "bkNYVzvgX3dxKuC5fOg5504ZsGY=";
// Digests expected once the website is in version 4 (some duplicated pages were modified).
// Same value as SINGES_GORILLE_DIGEST and SINGES_GORILLE_AND_DK_DIGEST_V3; dk.html gets its own digest.
private static final String SINGES_GORILLE_DIGEST_V4 = "TUnkhbiXTfUjJd8qMU9BTlhtXQA=";
private static final String SINGES_DK_DIGEST_V4 = "66PqN2bX2oMrXoQxI0aXu3Q0c4U=";
private static final String SINGES_TEXT_DIGEST_V4 = "rV1iSsaGrJgCyrMM0GgfH5zMBHk=";
// Same value as SINGES_TEXT_DIGEST_V3: copy/singes.txt is expected to keep the version 3 digest.
private static final String SINGES_TEXT_COPY_DIGEST_V4 = "qBpsYTysp2dnSNEh2cRhRJSh/3M=";
/**
 * Looks up the connector HTTP document stored for the given URL and returns its id.
 *
 * @param url URL of the document to look up
 * @return the id of the document found for that URL
 */
private String idOf(String url) {
    ConnectorHttpDocument document = es.getConnectorHttpDocumentByUrl(url);
    return document.getId();
}
/**
 * Loads the HTTP connector context of the current connector instance through a new
 * {@link ConnectorHttpContextServices}.
 *
 * @return the context loaded for the id of {@code connectorInstance}
 */
private ConnectorHttpContext loadContext() {
    ConnectorHttpContextServices contextServices = new ConnectorHttpContextServices(es);
    return contextServices.loadContext(connectorInstance.getId());
}
/**
 * Replaces the test website server by the one returned by {@link WebsitesUtils#startWebsiteInState1()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState1() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState1();
}
/**
 * Replaces the test website server by the one returned by {@link WebsitesUtils#startWebsiteInState2()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState2() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState2();
}
/**
 * Replaces the test website server by the one returned by
 * {@link WebsitesUtils#startWebsiteInState3WithDuplicates()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState3WithDuplucates() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState3WithDuplicates();
}
/**
 * Replaces the test website server by the one returned by
 * {@link WebsitesUtils#startWebsiteInState4WithDuplicatesModified()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState4WithDuplicatesModified() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState4WithDuplicatesModified();
}
/**
 * Replaces the test website server by the one returned by {@link WebsitesUtils#startWebsiteInState1Ntlm()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState1Ntlm() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState1Ntlm();
}
/**
 * Replaces the test website server by the one returned by {@link WebsitesUtils#startWebsiteInState5()}.
 * <p>
 * When a server is already referenced it is stopped and joined first; any exception raised while doing so is
 * rethrown wrapped in a {@link RuntimeException}.
 */
private void givenTestWebsiteInState5() {
    try {
        if (server != null) {
            server.stop();
            server.join();
        }
    } catch (Exception stopFailure) {
        throw new RuntimeException(stopFailure);
    }
    server = WebsitesUtils.startWebsiteInState5();
}
/**
 * Asks the connector manager's crawler to crawl once, then returns the documents found by
 * {@link #connectorDocuments()}.
 *
 * @return the connector HTTP documents found after that single crawl pass
 */
private List<ConnectorHttpDocument> tickAndGetAllDocuments() {
    final int singlePass = 1;
    connectorManager.getCrawler().crawlNTimes(singlePass);

    final List<ConnectorHttpDocument> documentsAfterPass = connectorDocuments();
    return documentsAfterPass;
}
/**
 * Searches the connector HTTP documents whose IDENTIFIER is not null.
 * NOTE(review): presumably this matches every stored connector HTTP document - confirm.
 *
 * @return the documents returned by the search
 */
private List<ConnectorHttpDocument> connectorDocuments() {
return es.searchConnectorHttpDocuments(where(IDENTIFIER).isNotNull());
}
/**
 * Creates, through the connector manager, an enabled HTTP connector with code "zeConnector" and title
 * "Ze connector", seeded with the test website's index page and using the website root as include pattern.
 * The created connector is stored in {@code connectorInstance}.
 */
private void givenDataSet1Connector() {
    final String seedUrl = WEBSITE + "index.html";

    connectorInstance = connectorManager.createConnector(
            es.newConnectorHttpInstance()
                    .setCode("zeConnector")
                    .setTitle("Ze connector")
                    .setEnabled(true)
                    .setSeeds(seedUrl)
                    .setIncludePatterns(WEBSITE));
}
/**
 * Creates, through the connector manager, an enabled HTTP connector with code "zeConnector" and title
 * "Ze connector", seeded with the test website's index page and using the website root as include pattern.
 * Unlike {@link #givenDataSet1Connector()}, it also sets the NTLM authentication scheme, the user and domain
 * of {@link NtlmAuthenticationFilter} and the password "password".
 * The created connector is stored in {@code connectorInstance}.
 */
private void givenDataSet1ConnectorWithNtlmAuthentication() {
    final String seedUrl = WEBSITE + "index.html";

    connectorInstance = connectorManager.createConnector(
            es.newConnectorHttpInstance()
                    .setCode("zeConnector")
                    .setTitle("Ze connector")
                    .setEnabled(true)
                    .setSeeds(seedUrl)
                    .setIncludePatterns(WEBSITE)
                    .setAuthenticationScheme(AuthenticationScheme.NTLM)
                    .setUsername(NtlmAuthenticationFilter.USER)
                    .setPassword("password")
                    .setDomain(NtlmAuthenticationFilter.DOMAIN));
}
/**
 * Stops the test website server on a best-effort basis and forgets its reference.
 * <p>
 * Does nothing when no server is referenced. Failures while stopping or joining are not propagated, so that
 * this cleanup never masks the outcome of the test itself. The {@code server} field is cleared in every case,
 * so a server that failed to stop is not stopped again or reused later.
 */
private void stopWebsiteServer() {
    if (server == null) {
        return;
    }
    try {
        server.stop();
        server.join();
    } catch (InterruptedException interrupted) {
        // Keep the interruption visible to the caller instead of silently dropping it.
        Thread.currentThread().interrupt();
    } catch (Exception ignored) {
        // Deliberately ignored: stopping the server is cleanup only and must not fail the test.
    } finally {
        server = null;
    }
}
/**
 * Releases the resources used by a test: closes the event observer, then stops the test website server when
 * one is referenced.
 * <p>
 * The server is stopped in a {@code finally} block so that a failure while closing the observer cannot leave
 * the web server running for the following tests.
 *
 * @throws Exception if closing the event observer fails
 */
@After
public void tearDown()
        throws Exception {
    try {
        eventObserver.close();
    } finally {
        if (server != null) {
            stopWebsiteServer();
        }
    }
}
}