package eu.dnetlib.iis.wf.affmatching; import static com.google.common.collect.ImmutableList.of; import static eu.dnetlib.iis.common.utils.AvroTestUtils.createLocalAvroDataStore; import static eu.dnetlib.iis.common.utils.JsonAvroTestUtils.readMultipleJsonDataStores; import static eu.dnetlib.iis.common.utils.JsonTestUtils.readJson; import static eu.dnetlib.iis.common.utils.JsonTestUtils.readMultipleJsons; import static eu.dnetlib.iis.wf.affmatching.match.DocOrgRelationMatcherFactory.createDocOrgRelationMatcher; import static eu.dnetlib.iis.wf.affmatching.match.FirstWordsHashBucketMatcherFactory.createNameFirstWordsHashBucketMatcher; import static eu.dnetlib.iis.wf.affmatching.match.MainSectionHashBucketMatcherFactory.createAlternativeNameMainSectionHashBucketMatcher; import static eu.dnetlib.iis.wf.affmatching.match.MainSectionHashBucketMatcherFactory.createNameMainSectionHashBucketMatcher; import static eu.dnetlib.iis.wf.affmatching.match.MainSectionHashBucketMatcherFactory.createShortNameMainSectionHashBucketMatcher; import static java.util.stream.Collectors.toList; import java.io.File; import java.io.IOException; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; import org.junit.After; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.junit.experimental.categories.Category; import com.google.common.io.Files; import eu.dnetlib.iis.common.IntegrationTest; import eu.dnetlib.iis.importer.schemas.Organization; import eu.dnetlib.iis.importer.schemas.ProjectToOrganization; import eu.dnetlib.iis.metadataextraction.schemas.ExtractedDocumentMetadata; import eu.dnetlib.iis.referenceextraction.project.schemas.DocumentToProject; import eu.dnetlib.iis.wf.affmatching.match.AffOrgMatcher; import eu.dnetlib.iis.wf.affmatching.model.SimpleAffMatchResult; import eu.dnetlib.iis.wf.affmatching.orgalternativenames.AffMatchOrganizationAltNameFiller; import eu.dnetlib.iis.wf.affmatching.orgalternativenames.CsvOrganizationAltNamesDictionaryFactory; import eu.dnetlib.iis.wf.affmatching.orgalternativenames.OrganizationAltNameConst; import eu.dnetlib.iis.wf.affmatching.read.IisAffiliationReader; import eu.dnetlib.iis.wf.affmatching.read.IisOrganizationReader; import eu.dnetlib.iis.wf.affmatching.write.AffMatchResultWriter; import eu.dnetlib.iis.wf.affmatching.write.SimpleAffMatchResultWriter; import scala.Tuple2; /** * Affiliation matching module test that measures quality of matching.<br/> * Tests in this class use alternative {@link AffMatchResultWriter} which does * not loose information about matched affiliation position in document.<br/> * <br/> * Quality of matching is described by four factors:<br/> * <ul> * <li>All matches - percentage of all actual matches to all expected matches</li> * <li>All distinct aff matches - percentage of actual matches with distinct affiliations to expected matches with distinct affiliations</li> * <li>True positives - percentage of returned results that was matched correctly</li> * <li>False positives - percentage of returned results that was matched incorrectly (sums to 100% with true positives)</li> * <ul><br/> * * @author madryk */ @Category(IntegrationTest.class) public class AffMatchingAffOrgQualityTest { private final static boolean PRINT_NOT_MATCHED = true; private final static boolean PRINT_FALSE_POSITIVE_MATCHES = true; private final static String INPUT_DATA_DIR_PATH = "src/test/resources/experimentalData/input"; private AffMatchingService affMatchingService; private static JavaSparkContext sparkContext; private File workingDir; private String inputOrgDirPath; private String inputAffDirPath; private String inputDocProjDirPath; private String inputInferredDocProjDirPath; private float inputDocProjConfidenceThreshold = 0.8f; private String inputProjOrgDirPath; private String outputDirPath; private String outputReportPath; @BeforeClass public static void classSetup() { SparkConf conf = new SparkConf(); conf.setMaster("local"); conf.setAppName(AffMatchingAffOrgQualityTest.class.getName()); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); conf.set("spark.kryo.registrator", "pl.edu.icm.sparkutils.avro.AvroCompatibleKryoRegistrator"); conf.set("spark.driver.host", "localhost"); sparkContext = new JavaSparkContext(conf); } @Before public void setup() throws IOException { workingDir = Files.createTempDir(); inputOrgDirPath = workingDir + "/affiliation_matching/input/organizations"; inputAffDirPath = workingDir + "/affiliation_matching/input/affiliations"; inputDocProjDirPath = workingDir + "/affiliation_matching/input/doc_proj"; inputInferredDocProjDirPath = workingDir + "/affiliation_matching/input/doc_proj_inferred"; inputProjOrgDirPath = workingDir + "/affiliation_matching/input/proj_org"; outputDirPath = workingDir + "/affiliation_matching/output"; outputReportPath = workingDir + "/affiliation_matching/report"; affMatchingService = createAffMatchingService(); } @After public void cleanup() throws IOException { FileUtils.deleteDirectory(workingDir); } @AfterClass public static void classCleanup() throws IOException { if (sparkContext != null) { sparkContext.close(); } } //------------------------ TESTS -------------------------- @Test public void matchAffiliations_combined_data() throws IOException { // given createInputDataFromJsonFiles( of(INPUT_DATA_DIR_PATH + "/all_organizations.json"), of(INPUT_DATA_DIR_PATH + "/docs_with_aff_real_data.json"), of(INPUT_DATA_DIR_PATH + "/doc_project.json"), of(), of(INPUT_DATA_DIR_PATH + "/org_project.json")); // execute affMatchingService.matchAffiliations(sparkContext, inputAffDirPath, inputOrgDirPath, outputDirPath, outputReportPath); // log System.out.println("\nALL TEST DATA"); readResultsAndPrintQualityRate( of("src/test/resources/experimentalData/expectedOutput/matched_aff.json")); } //------------------------ PRIVATE -------------------------- private void createInputDataFromJsonFiles(List<String> jsonInputOrgPaths, List<String> jsonInputAffPaths, List<String> jsonInputDocProjPaths, List<String> jsonInputInferredDocProjPaths, List<String> jsonInputProjOrgPaths) throws IOException { createLocalAvroDataStore(readMultipleJsonDataStores(jsonInputOrgPaths, Organization.class), inputOrgDirPath, Organization.class); createLocalAvroDataStore(readMultipleJsonDataStores(jsonInputAffPaths, ExtractedDocumentMetadata.class), inputAffDirPath, ExtractedDocumentMetadata.class); createLocalAvroDataStore(readMultipleJsonDataStores(jsonInputDocProjPaths, eu.dnetlib.iis.importer.schemas.DocumentToProject.class), inputDocProjDirPath, eu.dnetlib.iis.importer.schemas.DocumentToProject.class); createLocalAvroDataStore(readMultipleJsonDataStores(jsonInputInferredDocProjPaths, DocumentToProject.class), inputInferredDocProjDirPath, DocumentToProject.class); createLocalAvroDataStore(readMultipleJsonDataStores(jsonInputProjOrgPaths, ProjectToOrganization.class), inputProjOrgDirPath, ProjectToOrganization.class); } private void readResultsAndPrintQualityRate(List<String> expectedResultsJsonPaths) throws IOException { List<SimpleAffMatchResult> actualMatches = readJson(outputDirPath + "/part-00000", SimpleAffMatchResult.class); List<SimpleAffMatchResult> expectedMatches = readMultipleJsons(expectedResultsJsonPaths, SimpleAffMatchResult.class); printTruePositivesFactor(expectedMatches, actualMatches); printFalsePositivesFactor(expectedMatches, actualMatches); if (PRINT_FALSE_POSITIVE_MATCHES) { AffMatchingResultPrinter.printFalsePositives(inputAffDirPath, inputOrgDirPath, expectedMatches, actualMatches); } if (PRINT_NOT_MATCHED) { AffMatchingResultPrinter.printNotMatched(inputAffDirPath, inputOrgDirPath, expectedMatches, actualMatches); } } private void printTruePositivesFactor(List<SimpleAffMatchResult> expectedMatches, List<SimpleAffMatchResult> actualMatches) { List<SimpleAffMatchResult> truePositives = actualMatches.stream() .filter(x -> expectedMatches.contains(x)) .collect(toList()); int distinctAffActualMatchesCount = actualMatches.stream() .collect(Collectors.groupingBy(x -> new Tuple2<>(x.getDocumentId(), x.getAffiliationPosition()))) .size(); int distinctAffExpectedMatchesCount = expectedMatches.stream() .collect(Collectors.groupingBy(x -> new Tuple2<>(x.getDocumentId(), x.getAffiliationPosition()))) .size(); printQualityFactor("All matches", actualMatches.size(), expectedMatches.size()); printQualityFactor("All distinct aff matches", distinctAffActualMatchesCount, distinctAffExpectedMatchesCount); printQualityFactor("Correct matches", truePositives.size(), actualMatches.size()); } private void printFalsePositivesFactor(List<SimpleAffMatchResult> expectedMatches, List<SimpleAffMatchResult> actualMatches) { List<SimpleAffMatchResult> falsePositives = actualMatches.stream() .filter(x -> !expectedMatches.contains(x)) .collect(toList()); printQualityFactor("False positives", falsePositives.size(), actualMatches.size()); } private void printQualityFactor(String factorName, int goodCount, int totalCount) { double factorPercentage = ((double)goodCount/totalCount)*100; String text = String.format("%-30s %5.2f%% (%d/%d)", factorName + ":", factorPercentage, goodCount, totalCount); System.out.println(text); } private AffMatchingService createAffMatchingService() throws IOException { AffMatchingService affMatchingService = new AffMatchingService(); // readers affMatchingService.setAffiliationReader(new IisAffiliationReader()); affMatchingService.setOrganizationReader(new IisOrganizationReader()); // writer affMatchingService.setAffMatchResultWriter(new SimpleAffMatchResultWriter()); // matchers AffOrgMatcher docOrgRelationMatcher = createDocOrgRelationMatcher(sparkContext, inputDocProjDirPath, inputInferredDocProjDirPath, inputProjOrgDirPath, inputDocProjConfidenceThreshold); AffOrgMatcher nameMainSectionHashBucketMatcher = createNameMainSectionHashBucketMatcher(); AffOrgMatcher shortNameMainSectionHashBucketMatcher = createShortNameMainSectionHashBucketMatcher(); AffOrgMatcher alternativeNameMainSectionHashBucketMatcher = createAlternativeNameMainSectionHashBucketMatcher(); AffOrgMatcher firstWordsNameHashBucketMatcher = createNameFirstWordsHashBucketMatcher(); affMatchingService.setAffOrgMatchers(of(docOrgRelationMatcher, nameMainSectionHashBucketMatcher, shortNameMainSectionHashBucketMatcher, alternativeNameMainSectionHashBucketMatcher, firstWordsNameHashBucketMatcher)); AffMatchOrganizationAltNameFiller altNameFiller = createAffMatchOrganizationAltNameFiller(); affMatchingService.setAffMatchOrganizationAltNameFiller(altNameFiller); return affMatchingService; } private AffMatchOrganizationAltNameFiller createAffMatchOrganizationAltNameFiller() throws IOException { AffMatchOrganizationAltNameFiller altNameFiller = new AffMatchOrganizationAltNameFiller(); List<Set<String>> alternativeNamesDictionary = new CsvOrganizationAltNamesDictionaryFactory() .createAlternativeNamesDictionary(OrganizationAltNameConst.CLASSPATH_ALTERNATIVE_NAMES_CSV_FILES); altNameFiller.setAlternativeNamesDictionary(alternativeNamesDictionary); return altNameFiller; } }