package org.nextprot.api.tasks.solr.indexer.entry.diff; import java.util.List; import java.util.Set; import java.util.TreeSet; import org.junit.Assert; import org.junit.Ignore; import org.junit.Test; import org.nextprot.api.core.domain.Entry; import org.nextprot.api.solr.index.EntryIndex.Fields; import org.nextprot.api.tasks.solr.indexer.entry.SolrDiffTest; import org.nextprot.api.tasks.solr.indexer.entry.impl.PublicationsFieldBuilder; public class PublicationFieldBuilderDiffTest extends SolrDiffTest { // This is the test for the Publication field of the entries core index (npentries1), not the publication index @Test public void testPublications() { String[] test_list = {"NX_Q8IWA4", "NX_O00115","NX_Q7Z6P3","NX_E5RQL4","NX_P43686","NX_Q7Z6P3", "NX_Q7Z713", "NX_P22102", "NX_Q7Z713", "NX_O00116", "NX_Q7Z713", "NX_O15056"}; for(int i=0; i < test_list.length; i++){ testPublications(getEntry(test_list[i])); } // for(int i=0; i < 10; i++){ testPublications(getEntry(i)); } // 'random' entries //Entry entry = getEntry("NX_P22102"); // fails //Entry entry = getEntry("NX_P61604"); //testPublications(entry); } public void testPublications(Entry entry) { String entryName = entry.getUniqueName(); System.out.println("Testing: " + entryName); PublicationsFieldBuilder pfb = new PublicationsFieldBuilder(); pfb.initializeBuilder(entry); Set<String> expectedPublisRaw = new TreeSet<String>((List) getValueForFieldInCurrentSolrImplementation(entryName, Fields.PUBLICATIONS)); Set<String> expectedValues = new TreeSet<String>(); for (String s : expectedPublisRaw) { String indextoken; // Many values in current index end with spaces -> trimming if(s.startsWith("<p>")) { // like <p><b>title : </b>Mapping the hallmarks of lung adenocarcinoma with massively parallel sequencing.</p><p><b>journal</b> : Cell - Cell</p><p><b>nlmid:</b>0413066</p><p><b>authors : </b>Imielinski Marcin M", indextoken = getValueFromRawData(s,"title"); if(indextoken != null && indextoken.length() > 1) { // Titles from foreign lanuage journals are often enclosed in square brackets, they are stripped in the api but not in current index if(indextoken.startsWith("[")) indextoken = indextoken.substring(1, indextoken.length()-1); expectedValues.add(removeDoubleSpace(indextoken, "title")); } indextoken = getValueFromRawData(s,"journal"); if(indextoken != null) expectedValues.add(indextoken.substring(3)); indextoken = getValueFromRawData(s,"nlmid"); if(indextoken != null) expectedValues.add(removeDoubleSpace(indextoken, "nlmid")); indextoken = getValueFromRawData(s,"authors"); if(indextoken != null) expectedValues.add(removeDoubleSpace(indextoken,"authors")); } else if(s.endsWith("</p>")) { indextoken = s.substring(0,s.indexOf("</p>")).trim(); expectedValues.add(removeDoubleSpace(indextoken, "unknown subfield")); // like "Meyerson Matthew M</p>" } else expectedValues.add(removeDoubleSpace(s, "unknown subfield 2")); } //Set<String> expectedPubliscopy = new TreeSet<String>(expectedPublis); /*for(String elem : PublicationSet) System.out.println(elem); System.err.println(PublicationSet.size() + " elements in the new index"); */ Set<String> tmpset = new TreeSet<String>(pfb.getFieldValue(Fields.PUBLICATIONS, List.class)); TreeSet<String> PublicationSet = new TreeSet<>(); for (String s: tmpset) { PublicationSet.add(removeDoubleSpace(s, "new index data")); } /* Set<String> PublicationSetcopy = new TreeSet<String>(PublicationSet); PublicationSet.removeAll(expectedValues); System.err.println(PublicationSet.size() + " elements are only in the new index"); for(String elem : PublicationSet) System.out.println(elem); expectedValues.removeAll(PublicationSetcopy); System.err.println("\n" + expectedValues.size() + " elements are only in the old index"); for(String elem : expectedValues) System.out.println(elem); */ //Assert.assertEquals( expectedValues.size(), PublicationSet.size()); Assert.assertTrue( PublicationSet.size() >= expectedValues.size()); //System.err.println("expected: " + expectedValues.size() + " actual: " + PublicationSet.size()); if(expectedValues.size() < PublicationSet.size()) { PublicationSet.removeAll(expectedValues); System.err.println("WARNING: (" + entryName + ") " + PublicationSet.size() + " element(s) are only in the new index: " + PublicationSet); } int pubCount, expectedPubCount; expectedPubCount = (int) getValueForFieldInCurrentSolrImplementation(entryName, Fields.PUBLI_CURATED_COUNT); pubCount = pfb.getFieldValue(Fields.PUBLI_CURATED_COUNT, Integer.class); //System.err.println("PUBLI_CURATED_COUNT: " + expectedPubCount + " Now: " + pubCount); Assert.assertEquals(expectedPubCount, pubCount); expectedPubCount = (int) getValueForFieldInCurrentSolrImplementation(entryName, Fields.PUBLI_COMPUTED_COUNT); pubCount = pfb.getFieldValue(Fields.PUBLI_COMPUTED_COUNT, Integer.class); //System.err.println("PUBLI_COMPUTED_COUNT: " + expectedPubCount + " Now: " + pubCount); Assert.assertEquals(expectedPubCount, pubCount); expectedPubCount = (int) getValueForFieldInCurrentSolrImplementation(entryName, Fields.PUBLI_LARGE_SCALE_COUNT); pubCount = pfb.getFieldValue(Fields.PUBLI_LARGE_SCALE_COUNT, Integer.class); //System.err.println("PUBLI_LARGE_SCALE_COUNT: " + expectedPubCount + " Now: " + pubCount); Assert.assertEquals(expectedPubCount, pubCount); float expectedScore = (float) getValueForFieldInCurrentSolrImplementation(entryName, Fields.INFORMATIONAL_SCORE); float score = pfb.getFieldValue(Fields.INFORMATIONAL_SCORE, Float.class); //System.err.println("INFORMATIONAL_SCORE: " + expectedScore + " Now: " + score); Assert.assertEquals(expectedScore, score, 0.001); } private String removeDoubleSpace(String s, String fname) { //System.out.println(fname + " avant:<" + s + ">"); //if (s.contains("Cancer Genome Atlas Research")) System.out.println(fname + " ascii: " + (int)s.charAt(s.length()-2)+ " " + (int)s.charAt(s.length()-1)); s.trim(); int lng=s.length(); while(true) { s = s.replaceAll(" ", " "); if (s.length()==lng) break; lng=s.length(); } //System.out.println(fname + " apres:<" + s + ">"); //if (s.contains("Cancer Genome Atlas Research")) System.out.println(fname + " ascii: " + (int)s.charAt(s.length()-2) + " " + (int)s.charAt(s.length()-1)); return s; } }