package word2vec.corpuscreation;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import com.google.gson.Gson;
import word2vec.corpuscreation.CreateBiomedicalEntityCorpus.CalbCPubMedID.Concept;
import word2vec.corpuscreation.CreateBiomedicalEntityCorpus.CalbCPubMedID.Entity;
public class CreateBiomedicalEntityCorpus {
private static final String inputFile = "/home/quh/Arbeitsfläche/Entpackung/Arbeitsfläche/Code_Data/Calbc/output.json";
private static final String output = "/home/quh/Arbeitsfläche/CalbcSmallEntityCorpus.dat";
public static void main(String[] args) {
CreateBiomedicalEntityCorpus corpus = new CreateBiomedicalEntityCorpus();
corpus.action();
}
public CreateBiomedicalEntityCorpus() {
super();
}
public void action() {
Gson gson = new Gson();
File f = new File(inputFile);
PrintWriter writer = null;
String line = null;
BufferedReader reader = null;
try {
writer = new PrintWriter(output);
reader = new BufferedReader(new FileReader(f));
while ((line = reader.readLine()) != null) {
if (!line.equals("")) {
CalbCPubMedID id = gson.fromJson(line, CalbCPubMedID.class);
List<Entity> entities = id.getEntityList();
for (Entity e : entities) {
List<Concept> conceptList = e.getConceptList();
for (Concept c : conceptList) {
if (!generateID(c.getUrl()).equals("")) {
writer.print(convertURL(c.getUrl()) + " ");
}
}
}
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (writer != null) {
writer.close();
}
}
}
private String convertURL(String s) {
String splitter[] = s.split(":");
s = s.toLowerCase();
if(s.contains("disease")) {
return "UMLS_"+splitter[2];
} else if(s.contains("umls")) {
return "UMLS_"+splitter[2];
} else if(s.contains("uniprot")) {
return "UNIPROT_"+splitter[2];
} else if(s.contains("ncbi")) {
return "UMLS_"+splitter[2];
}
return "";
}
private String generateID(String line) {
String[] splitter = line.split(":");
String link = "";
if (splitter[1].equalsIgnoreCase("uniprot") && !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "UN_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("entrezgene") && !splitter[2].equalsIgnoreCase("")
&& splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("umls") && !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "LI_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("ncbi") && !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("disease") && !splitter[2].equalsIgnoreCase("")
&& splitter[2] != null) {
link = "LI_" + splitter[2];
}
return link;
}
class CalbCPubMedID {
private String title;
private String abs;
private Metadata metadata;
private List<Entity> entityList;
private String id;
public CalbCPubMedID() {
abs = "";
entityList = new LinkedList<Entity>();
}
public void setTitle(String title) {
this.title = title;
}
public void setId(String id) {
this.id = id;
}
public String getTitle() {
return title;
}
public String getId() {
return id;
}
public String getAbs() {
return abs;
}
public void setAbs(String abs) {
this.abs = abs;
}
public List<Entity> getEntityList() {
return entityList;
}
public void addConcept(Entity entity) {
entityList.add(entity);
}
public Metadata getMetadata() {
return metadata;
}
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
public void setEntityList(List<Entity> entityList) {
this.entityList = entityList;
}
public void concatAbstract(String newabs) {
abs += newabs;
}
public class Entity {
private String keyword;
private List<Concept> conceptList;
private boolean isTitle;
private int position;
public Entity() {
conceptList = new LinkedList<Concept>();
position = 0;
}
public Entity(String keyword, boolean isTitle, int position) {
this.keyword = keyword;
this.isTitle = isTitle;
this.position = position;
conceptList = new LinkedList<Concept>();
}
public void addConcept(Concept concept) {
conceptList.add(concept);
}
public String getKeyword() {
return keyword;
}
public void setKeyword(String keyword) {
this.keyword = keyword;
}
public List<Concept> getConceptList() {
return conceptList;
}
public void setConceptList(List<Concept> conceptList) {
this.conceptList = conceptList;
}
public boolean isTitle() {
return isTitle;
}
public void setTitle(boolean isTitle) {
this.isTitle = isTitle;
}
public int getPosition() {
return position;
}
public void setPosition(int position) {
this.position = position;
}
///////////////// Experiment Methode //////////////////////////////
public boolean hasNCBIConcepts() {
boolean hasNCBI = false;
for (Iterator<Concept> iterator = conceptList.iterator(); iterator.hasNext();) {
Concept con = iterator.next();
if (con.getUrl().contains("ncbi")) {
hasNCBI = true;
break;
}
}
return hasNCBI;
}
}
public class Metadata {
private List<Author> authorList;
public Metadata() {
authorList = new LinkedList<Author>();
}
public List<Author> getAuthorList() {
return authorList;
}
public void addAuthor(Author author) {
authorList.add(author);
}
}
public class Author {
private String name;
private String forename;
private String shortname;
public Author() {
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getForename() {
return forename;
}
public void setForename(String forename) {
this.forename = forename;
}
public String getShortname() {
return shortname;
}
public void setShortname(String shortname) {
this.shortname = shortname;
}
}
public class Concept {
private String url;
public Concept() {
}
public String getUrl() {
return url;
}
public void setUrl(String url) {
this.url = url;
}
}
}
}