package org.mindinformatics.services.connector.pubmed.dataaccess;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.GregorianCalendar;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
// http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=helppubmed&part=pubmedhelp
// http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=helplinks&part=linkshelp
/**
* @author Paolo Ciccarese <paolo.ciccarese@gmail.com>
*/
public class PubmedQueryTermBuilder {
// http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T43/?report=objectonly
String[] stopwords = {
"a", "about", "again", "all", "almost", "also", "although", "always", "among", "an", "and", "another", "any", "are", "as", "at",
"be", "because", "been", "before", "being", "between", "both", "but", "by",
"can", "could",
"did", "do", "does", "done", "due", "during",
"each", "either", "enough", "especially", "etc",
"for", "found", "from", "further",
"had", "has", "have", "having", "here", "how", "however",
"i", "if", "in", "into", "is", "it", "its", "itself",
"just",
"kg", "km",
"made", "mainly", "make", "may", "mg", "might", "ml", "mm", "most", "mostly", "must",
"nearly", "neither", "no", "nor",
"obtained", "of", "often", "on", "our", "overall",
"perhaps", "pmid",
"quite",
"rather", "really", "regarding",
"seem", "seen", "several", "should", "show", "showed", "shown", "shows", "significantly", "since", "so", "some", "such",
"than", "that", "the", "their", "theirs", "them", "then", "there", "therefore", "these", "they", "this", "those", "through", "thus","to",
"upon", "use", "used", "using",
"various", "very",
"was", "we", "were", "what", "when", "which", "while", "with", "within", "without", "would"
};
public static final Integer EARLIEST_PUBLICATION_START_YEAR = 1900;
public static String AUTHOR_FIELD_TAG="AU";
public static String JOURNAL_TITLE_FIELD_TAG="TA";
public static String TITLE_FIELD_TAG="TI";
public static String TITLE_AND_ABSTRACT_FIELD_TAG="TIAB";
public static String MESH_TERM_FIELD_TAG="MH";
public static String PUBLICATION_DATE_TERM_FIELD_TAG="DP";
public static String PUBMED_ID_FIELD_TAG="PMID";
public static String ALL_FIELD_TAG="ALL";
private List<String> authors = new ArrayList<String>();
private List<String> journalTitles = new ArrayList<String>();
private List<String> titles = new ArrayList<String>();
private List<String> titleAndAbstract = new ArrayList<String>();
private List<String> meshTerms = new ArrayList<String>();
private List<String> pubmedIds = new ArrayList<String>();
private List<String> all = new ArrayList<String>();
private List<String> publicationTypes = new ArrayList<String>();
private Integer publicationStartMonthIndex;
private Integer publicationEndMonthIndex;
private Integer publicationStartYearIndex;
private Integer publicationEndYearIndex;
private Map<String,List<String>> tagToFieldValuesMap = new LinkedHashMap<String,List<String>>();
public PubmedQueryTermBuilder(){
tagToFieldValuesMap.put(AUTHOR_FIELD_TAG, authors);
tagToFieldValuesMap.put(JOURNAL_TITLE_FIELD_TAG,journalTitles);
tagToFieldValuesMap.put(TITLE_FIELD_TAG,titles);
tagToFieldValuesMap.put(TITLE_AND_ABSTRACT_FIELD_TAG, titleAndAbstract);
tagToFieldValuesMap.put(MESH_TERM_FIELD_TAG,meshTerms);
}
public void setPublicationDateRange(Integer startMonthIndex, Integer startYearIndex,Integer endMonthIndex, Integer endYearIndex){
this.publicationStartMonthIndex = startMonthIndex;
this.publicationStartYearIndex = startYearIndex;
this.publicationEndMonthIndex = endMonthIndex;
this.publicationEndYearIndex = endYearIndex;
}
public void addPublicationTypes(Collection<String> publicationTypes){
this.publicationTypes.addAll(publicationTypes);
}
public void addPubmedIds(Collection<String> pubmedIds){
this.pubmedIds.addAll(pubmedIds);
}
public void add(Collection<String> pubmedCentralIds){
this.all.addAll(pubmedCentralIds);
}
public void addAuthor(String authorName){
authors.add(authorName);
}
public void addAuthors(Collection<String> authorNames){
authors.addAll(authorNames);
}
public void addJournalTitle(String journalTitle){
journalTitles.add(journalTitle);
}
public void addJournalTitles(Collection<String>journalTitles){
journalTitles.addAll(journalTitles);
}
public void addJournalArticleTitleWord(String title){
// for(String word: stopwords) {
// if(word.equals(title)) return;
// }
titles.add(title);
}
public void addJournalArticleTitleWords(Collection<String>titleWords){
// for(String titleWord: titleWords) {
// System.out.println(">> " + titleWord);
// addJournalArticleTitleWord(titleWord);
// //if(!stopwords.contains(titleWord)) titles.add(titleWord);
// }
titles.addAll(titleWords);
}
// public void addTitleAndAbstractSearchWord(String aTerm){
// titleAndAbstract.add(aTerm);
// }
// public void addTitleAndAbstractSearchWords(Collection<String>searchWords){
// titleAndAbstract.addAll(searchWords);
// }
public void addMeshTerm(String meshTerm){
this.meshTerms.add(meshTerm);
}
public void addMeshTerms(Collection<String>meshTerms){
this.meshTerms.addAll(meshTerms);
}
private String searchClause(List<String> fieldList,String fieldTag){
StringBuilder builder = new StringBuilder();
String connectingOp = "+AND+";
for(String fieldValue: fieldList){
builder.append(this.cleanupSearchTerm(fieldValue, fieldTag));
builder.append("[");
builder.append(fieldTag);
builder.append("]");
builder.append(connectingOp);
}
if (builder.length() > 0){
return builder.substring(0,builder.length() - connectingOp.length());
}
return "";
}
/*
private String getPublicationTypeSearchClause(){
return getSearchClauseString(this.publicationTypes,PUBLICATION_TYPE_FIELD_TAG,"+OR+");
}
*/
private String getPubmedIdSearchClause(){
return getSearchClauseString(this.pubmedIds,PUBMED_ID_FIELD_TAG,"+OR+");
}
private String getAllSearchClause(){
return getSearchClauseString(this.all,ALL_FIELD_TAG,"+OR+");
}
private String getSearchClauseString(Collection<String> operands,String fieldTag,String connectionOp){
StringBuilder builder = new StringBuilder();
builder.append("(");
int i = 0;
for(String operand : operands){
builder.append(operand);
builder.append("[");
builder.append(fieldTag);
builder.append("]");
if(i < operands.size() -1){
builder.append(connectionOp);
}
i++;
}
builder.append(")");
return builder.toString();
}
private String publicationDateSearchTerm(){
if(publicationStartYearIndex == null && publicationEndYearIndex == null){
return null;
}
StringBuilder builder = new StringBuilder();
if (this.publicationStartYearIndex != null){
builder.append(this.dateTerm(null, publicationStartMonthIndex, publicationStartYearIndex));
}
if (publicationStartYearIndex == null && publicationEndYearIndex != null){
builder.append(this.dateTerm(null,null,EARLIEST_PUBLICATION_START_YEAR));
}
builder.append(":");
if (this.publicationEndYearIndex != null){
builder.append(this.dateTerm(null, publicationEndMonthIndex, publicationEndYearIndex));
} else {
GregorianCalendar calendar = new GregorianCalendar();
System.out.println(calendar.get(GregorianCalendar.YEAR));
builder.append(this.dateTerm(calendar.get(GregorianCalendar.DAY_OF_MONTH), calendar.get(GregorianCalendar.MONTH)+1, calendar.get(GregorianCalendar.YEAR)));
}
builder.append("[");
builder.append(PUBLICATION_DATE_TERM_FIELD_TAG);
builder.append("]");
return builder.toString();
}
private String dateTerm(Integer day, Integer month, Integer year){
if (year == null){
return null;
}
StringBuilder builder = new StringBuilder();
builder.append(year.toString());
if(month != null){
builder.append('/');
builder.append(month.toString());
if (day != null){
builder.append('/');
builder.append(day.toString());
}
}
return builder.toString();
}
private List<String> getSearchClauses(){
List<String> searchClauses = new ArrayList<String>();
Set<Entry<String,List<String>>> tagAndFieldValues = tagToFieldValuesMap.entrySet();
for(Entry<String,List<String>> tagAndFieldValueList : tagAndFieldValues){
String currentClause = this.searchClause(tagAndFieldValueList.getValue(),tagAndFieldValueList.getKey());
if (StringUtils.isNotEmpty(currentClause)){
searchClauses.add(currentClause);
}
}
String publicationDateSearchTerm = publicationDateSearchTerm();
if (StringUtils.isNotEmpty(publicationDateSearchTerm)){
searchClauses.add(publicationDateSearchTerm);
}
if(pubmedIds.size() > 0){
searchClauses.add(this.getPubmedIdSearchClause());
}
if(all.size() > 0){
searchClauses.add(this.getAllSearchClause());
}
if(publicationTypes.size() > 0){
//searchClauses.add(this.getPublicationTypeSearchClause());
}
return searchClauses;
}
public String toString(){
StringBuilder builder = new StringBuilder();
String connectingOp = "+AND+";
for(String currentClause : this.getSearchClauses()){
builder.append(currentClause);
builder.append(connectingOp);
}
if (builder.length() > 0){
return builder.substring(0,builder.length() - connectingOp.length());
}
return null;
}
private String cleanupSearchTerm(String searchTerm, String fieldTag){
StringTokenizer tokenizer = new StringTokenizer(searchTerm);
StringBuilder builder = new StringBuilder();
while(tokenizer.hasMoreTokens()){
String currentToken = tokenizer.nextToken();
try {
boolean isStopWord = false;
for(String word: stopwords) {
if(word.equals(currentToken.trim())) isStopWord=true;
}
if(!isStopWord) {
builder.append(URLEncoder.encode(currentToken, "utf-8"));
if (tokenizer.hasMoreTokens()){
builder.append("["+fieldTag+"]+AND+");
}
}
}catch(UnsupportedEncodingException e){
continue;
}
}
return builder.toString();
}
}