/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.source.pubmed;

import java.util.Arrays;
import java.util.Set;

import org.carrot2.core.Document;
import org.carrot2.source.SearchEngineResponse;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import org.carrot2.shaded.guava.common.collect.Sets;

/**
 * A SAX content handler that turns PubMed article XML into {@link Document}s.
 *
 * <p>The constructor registers one trigger for the article-level elements
 * ({@code PubmedArticle} and {@code PubmedBookArticle}) and, nested inside it, triggers
 * for the PMID, the title and the abstract text. Every article-level element for which a
 * PMID was collected contributes one document to the {@link SearchEngineResponse}
 * returned by {@link #getResponse()}.
 *
 * <p>NOTE(review): the exact moments at which {@code onElement} / {@code afterElement}
 * fire are defined by {@code PathTrackingHandler} / {@code Trigger}, which are not visible
 * from this file; the comments below assume "element opened" and "element closed, with
 * its collected text" respectively. Confirm against the base class.
 */
class PubMedContentHandler extends PathTrackingHandler {
    /** Prefix that, followed by the PMID, forms the URL stored in each document. */
    private static final String PUBMED_URL_PREFIX = "http://www.ncbi.nlm.nih.gov/pubmed/";

    /** Collects PubMed results; replaced by a fresh instance in {@link #startDocument()}. */
    private SearchEngineResponse response;

    /**
     * Registers the triggers that extract the PMID, title and abstract of every article.
     */
    public PubMedContentHandler() {
        super.addTrigger(
            Arrays.asList(
                "/PubmedArticleSet/PubmedArticle",
                "/PubmedArticleSet/PubmedBookArticle"),
            new Trigger() {
                /** PMID of the article being processed, or null if none was seen yet. */
                String currentPmid;

                /** Title of the article being processed, or null if none was seen yet. */
                String currentTitle;

                /** Accumulates the accepted abstract sections of the current article. */
                StringBuilder abstractBuffer = new StringBuilder();

                // Nested triggers are registered from the instance initializer, in the
                // same order as before: PMID, title, abstract text.
                {
                    addTrigger(
                        Arrays.asList(
                            "/PubmedArticleSet/PubmedArticle/MedlineCitation/PMID",
                            "/PubmedArticleSet/PubmedBookArticle/BookDocument/PMID"),
                        new Trigger() {
                            @Override
                            public void afterElement(String localName, String path, String text) {
                                // At most one PMID is expected per article.
                                assert currentPmid == null;
                                currentPmid = text;
                            }
                        });

                    addTrigger(
                        Arrays.asList(
                            "/PubmedArticleSet/PubmedArticle/MedlineCitation/Article/ArticleTitle",
                            "/PubmedArticleSet/PubmedBookArticle/BookDocument/Book/ArticleTitle"),
                        new Trigger() {
                            @Override
                            public void afterElement(String localName, String path, String text) {
                                // At most one title is expected per article.
                                assert currentTitle == null;
                                currentTitle = text;
                            }
                        });

                    addTrigger(
                        Arrays.asList(
                            "/PubmedArticleSet/PubmedArticle/MedlineCitation/Article/Abstract/AbstractText",
                            "/PubmedArticleSet/PubmedBookArticle/BookDocument/Book/Abstract/AbstractText"),
                        new Trigger() {
                            /** {@code NlmCategory} values whose sections are left out. */
                            Set<String> skippedCategories = Sets.newHashSet(
                                "CONCLUSIONS",
                                "METHODS",
                                "RESULTS",
                                "DIAGNOSIS/TESTING",
                                "MANAGEMENT",
                                "GENETIC COUNSELING");

                            /** {@code NlmCategory} of the current section; null if absent. */
                            String nlmCategory;

                            @Override
                            public void onElement(String localName, String path, Attributes attrs) {
                                nlmCategory = attrs.getValue("", "NlmCategory");
                            }

                            @Override
                            public void afterElement(String localName, String path, String text) {
                                // Sections carrying one of the skipped categories are dropped;
                                // sections without a category are always kept.
                                if (nlmCategory != null && skippedCategories.contains(nlmCategory)) {
                                    return;
                                }

                                // Separate consecutive kept sections with an ellipsis.
                                if (abstractBuffer.length() > 0) {
                                    abstractBuffer.append(" ... ");
                                }
                                abstractBuffer.append(text);
                            }
                        });
                }

                @Override
                public void onElement(String localName, String path, Attributes attrs) {
                    // Forget everything collected for the previous article.
                    currentPmid = null;
                    currentTitle = null;
                    abstractBuffer.setLength(0);
                }

                @Override
                public void afterElement(String localName, String path, String text) {
                    if (currentPmid == null) {
                        LoggerFactory.getLogger(PubMedContentHandler.class)
                            .warn("No PMID on a <PubmedArticle>?");
                        return;
                    }

                    final String summary = abstractBuffer.toString();
                    final String url = PUBMED_URL_PREFIX + currentPmid;
                    response.results.add(
                        new Document(currentTitle, summary, url, null, currentPmid));
                }
            });
    }

    /**
     * Starts a new parse by creating an empty response to collect documents into.
     */
    @Override
    public void startDocument() throws SAXException {
        this.response = new SearchEngineResponse();
    }

    /**
     * Returns the response created by the most recent {@link #startDocument()} call, or
     * {@code null} if that method has not been called on this handler yet.
     */
    public SearchEngineResponse getResponse() {
        return response;
    }
}