/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */
package com.meaningcloud.gate.clients;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import com.meaningcloud.gate.param.TopicsBean;

import gate.util.InvalidOffsetException;

/**
 * Client-side helper that walks a MeaningCloud Topics XML response (DOM) and
 * converts the matched elements into GATE-style annotation descriptors
 * ({@link Annot}: start/end offsets plus a {@code gate.FeatureMap}).
 *
 * @author ADRIAN
 */
public class TopicsClient {

    /**
     * Pair of result lists returned by {@link #collectInfo}. Required in order
     * to perform recursive calls to the collect-info method: it is important
     * to traverse the whole tree and get the intra-sentence offsets.
     * Inter-sentence offsets are kept in the core, not in the client functions.
     */
    public static class Recursive {
        // Flat list of annotations produced from the parsed entities.
        public List<Annot> hack_ann_list = new ArrayList<Annot>();
        // Parsed entities (including recursively collected sub-entities).
        public List<TopicsBean.Entity> hack_entity_list = new ArrayList<TopicsBean.Entity>();
    }

    /** Simple start/end offset pair for one entity variant occurrence. */
    public static class AnnotationOffsets {
        public Long startOff = 0L, endOff = 0L;
    }

    /**
     * Turns parsed entities into one {@link Annot} per variant occurrence.
     * All variants of an entity share a single feature map.
     *
     * @param entity_list parsed entities (from {@link #collectInfo})
     * @param nameNode    annotation type name assigned to every Annot
     * @param ud          unused here — kept for signature parity with collectInfo
     * @return one annotation per variant that carried inip/endp offsets
     * @throws UnsupportedEncodingException from the String/byte round-trips below
     */
    public static List<Annot> transform(List<TopicsBean.Entity> entity_list,
            String nameNode, String ud) throws UnsupportedEncodingException {
        List<Annot> annotations = new ArrayList<Annot>();
        Iterator<TopicsBean.Entity> it = entity_list.iterator();
        int count = 0;
        while (it.hasNext()) {
            List<AnnotationOffsets> annOffList = new ArrayList<AnnotationOffsets>();
            annOffList.clear(); // freshly created, so this is a no-op
            TopicsBean.Entity ent = (TopicsBean.Entity) it.next();
            Iterator<TopicsBean.Entity.variant> it2 = ent.variant_list
                    .iterator();
            gate.FeatureMap fm = gate.Factory.newFeatureMap();
            fm.clear(); // freshly created, so this is a no-op
            count = 0; // per-entity count of variants with valid offsets
            // NOTE(review): everything below — including the entity-level
            // fm.put(...) blocks — runs once PER VARIANT. The entity-level
            // features are simply overwritten with identical values on each
            // pass, so the result is correct but the work is redundant.
            while (it2.hasNext()) {
                AnnotationOffsets annot = new AnnotationOffsets();
                TopicsBean.Entity.variant var = (TopicsBean.Entity.variant) it2
                        .next();
                if (!var.inip.isEmpty() && !var.endp.isEmpty()) {
                    annot.startOff = Long.parseLong(var.inip, 10);
                    // endp is inclusive in the API response; +1 makes it an
                    // exclusive end offset for GATE.
                    annot.endOff = Long.parseLong(var.endp, 10) + 1;
                    count++;
                    annOffList.add(annot);
                }
                if (!ent.form.isEmpty())
                    fm.put("Form", ent.form);
                if (!ent.dictionary.isEmpty())
                    // NOTE(review): getBytes() uses the platform default
                    // charset but is decoded as UTF-8 — a no-op only when the
                    // platform charset is UTF-8; consider getBytes("UTF-8").
                    // The same pattern repeats throughout this file.
                    fm.put("Dictionary",
                            new String(ent.dictionary.getBytes(), "UTF-8"));
                if (!ent.id.isEmpty())
                    fm.put("ID", ent.id);
                if (!ent.relevance.isEmpty())
                    fm.put("Relevance", ent.relevance);
                // --- sementity: parallel per-field value lists ---
                if (ent.sementity_list.size() > 0) {
                    ArrayList<String> sementity_class = new ArrayList<String>(),
                            sementity_confidence = new ArrayList<String>(),
                            sementity_fiction = new ArrayList<String>(),
                            sementity_id = new ArrayList<String>(),
                            sementity_type = new ArrayList<String>();
                    for (int i_it = 0; i_it < ent.sementity_list.size(); i_it++) {
                        sementity_class
                                .add(!ent.sementity_list.get(i_it).sementity_class
                                        .isEmpty() ? new String(
                                        ent.sementity_list.get(i_it).sementity_class
                                                .getBytes(), "utf-8") : "");
                        sementity_confidence
                                .add(!ent.sementity_list.get(i_it).sementity_confidence
                                        .isEmpty() ? new String(
                                        ent.sementity_list.get(i_it).sementity_confidence
                                                .getBytes(), "utf-8") : "");
                        sementity_fiction
                                .add(!ent.sementity_list.get(i_it).sementity_fiction
                                        .isEmpty() ? new String(
                                        ent.sementity_list.get(i_it).sementity_fiction
                                                .getBytes(), "utf-8") : "");
                        sementity_id
                                .add(!ent.sementity_list.get(i_it).sementity_id
                                        .isEmpty() ? new String(
                                        ent.sementity_list.get(i_it).sementity_id
                                                .getBytes(), "utf-8") : "");
                        sementity_type
                                .add(!ent.sementity_list.get(i_it).sementity_type
                                        .isEmpty() ? new String(
                                        ent.sementity_list.get(i_it).sementity_type
                                                .getBytes(), "utf-8") : "");
                    }
                    fm.put("sementity_class", sementity_class);
                    fm.put("sementity_confidence", sementity_confidence);
                    fm.put("sementity_fiction", sementity_fiction);
                    fm.put("sementity_id", sementity_id);
                    fm.put("sementity_type", sementity_type);
                }
                // --- semld: parallel source-dictionary / value lists ---
                if (ent.semld_list.size() > 0) {
                    ArrayList<String> semld = new ArrayList<String>(),
                            sourceDic = new ArrayList<String>();
                    Iterator iter = ent.semld_list.iterator(); // NOTE(review): raw Iterator type
                    while (iter.hasNext()) {
                        TopicsBean.Entity.semld smld = (com.meaningcloud.gate.param.TopicsBean.Entity.semld) iter
                                .next();
                        sourceDic.add(!smld.sourceDic.isEmpty() ? new String(
                                smld.sourceDic.getBytes(), "utf-8") : "");
                        semld.add(!smld.semld.isEmpty() ? new String(smld.semld
                                .getBytes(), "utf-8") : "");
                    }
                    fm.put("semld_source", sourceDic);
                    fm.put("semld_value", semld);
                }
                // --- standard: parallel key / value lists ---
                if (ent.standard_list.size() > 0) {
                    ArrayList<String> standard_value = new ArrayList<String>(),
                            standard_key = new ArrayList<String>();
                    for (int i_it = 0; i_it < ent.standard_list.size(); i_it++) {
                        standard_value.add(!ent.standard_list.get(i_it).value
                                .isEmpty() ? new String(ent.standard_list
                                .get(i_it).value.getBytes(), "utf-8") : "");
                        standard_key.add(!ent.standard_list.get(i_it).name
                                .isEmpty() ? new String(ent.standard_list
                                .get(i_it).name.getBytes(), "utf-8") : "");
                    }
                    fm.put("standard_key", standard_key);
                    fm.put("standard_value", standard_value);
                }
                // --- semgeo: features keyed by the geo element's own name ---
                if (ent.semgeo_list.size() > 0) {
                    for (int i_it = 0; i_it < ent.semgeo_list.size(); i_it++) {
                        int size = ent.semgeo_list.get(i_it).semgeo_list.size();
                        for (int j_it = 0; j_it < size; j_it++) {
                            if (!ent.semgeo_list.get(i_it).semgeo_list
                                    .get(j_it).form.isEmpty())
                                fm.put(ent.semgeo_list.get(i_it).semgeo_list
                                        .get(j_it).name
                                        + /* "_"+(j_it+1)+ */"_form",
                                        ent.semgeo_list.get(i_it).semgeo_list
                                                .get(j_it).form);
                            if (!ent.semgeo_list.get(i_it).semgeo_list
                                    .get(j_it).id.isEmpty())
                                fm.put(ent.semgeo_list.get(i_it).semgeo_list
                                        .get(j_it).name
                                        + /* "_"+(j_it+1)+ */"_id",
                                        ent.semgeo_list.get(i_it).semgeo_list
                                                .get(j_it).id);
                            int size_std = ent.semgeo_list.get(i_it).semgeo_list
                                    .get(j_it).stdArray.size();
                            // std entries keep the 1-based index in the key to
                            // avoid collisions between repeated elements.
                            for (int k_it = 0; k_it < size_std; k_it++) {
                                if (!ent.semgeo_list.get(i_it).semgeo_list
                                        .get(j_it).stdArray.get(k_it).name
                                        .isEmpty())
                                    fm.put(ent.semgeo_list.get(i_it).semgeo_list
                                            .get(j_it).name
                                            + "_"
                                            + (j_it + 1)
                                            + "_std."
                                            + ent.semgeo_list.get(i_it).semgeo_list
                                                    .get(j_it).stdArray
                                                    .get(k_it).name,
                                            ent.semgeo_list.get(i_it).semgeo_list
                                                    .get(j_it).stdArray
                                                    .get(k_it).value);
                            }
                        }
                    }
                }
                // --- semtheme: parallel id / type lists ---
                if (ent.semtheme_list.size() > 0) {
                    ArrayList<String> semtheme_id = new ArrayList<String>(),
                            semtheme_type = new ArrayList<String>();
                    for (int i_it = 0; i_it < ent.semtheme_list.size(); i_it++) {
                        semtheme_id.add(!ent.semtheme_list.get(i_it).id
                                .isEmpty() ? new String(ent.semtheme_list
                                .get(i_it).id.getBytes(), "utf-8") : "");
                        semtheme_type.add(!ent.semtheme_list.get(i_it).type
                                .isEmpty() ? new String(ent.semtheme_list
                                .get(i_it).type.getBytes(), "utf-8") : "");
                    }
                    fm.put("semtheme_id", semtheme_id);
                    fm.put("semtheme_type", semtheme_type);
                }
                // --- variants: forms plus [start, end+1] offset pairs ---
                if (ent.variant_list.size() > 0) {
                    ArrayList<String> variant = new ArrayList<String>();
                    ArrayList<ArrayList<Long>> offsets = new ArrayList<ArrayList<Long>>();
                    for (int i_it = 0; i_it < ent.variant_list.size(); i_it++) {
                        ArrayList<Long> offset = new ArrayList<Long>();
                        variant.add(!ent.variant_list.get(i_it).form.isEmpty() ? ent.variant_list
                                .get(i_it).form : "");
                        // NOTE(review): unlike the guarded block above, this
                        // parses inip/endp unconditionally — an empty string
                        // here raises NumberFormatException.
                        offset.add(Long.parseLong(ent.variant_list.get(i_it).inip));
                        offset.add(Long.parseLong(ent.variant_list.get(i_it).endp) + 1);
                        offsets.add(offset);
                    }
                    fm.put("variant_form", variant);
                    fm.put("variant_offset", offsets);
                }
            }
            // Emit one annotation per offset pair; all share the same fm.
            Annot annotat = new Annot();
            for (int k_it = 0; k_it < count; k_it++) {
                annotat = new Annot();
                annotat.startOff = annOffList.get(k_it).startOff;
                annotat.endOff = annOffList.get(k_it).endOff;
                annotat.Name = nameNode;
                annotat.fm = fm;
                annotations.add(annotat);
            }
        }
        return annotations;
    }

    /** One GATE annotation descriptor: span, type name and feature map. */
    public static class Annot {
        public Long startOff = 0L, endOff = 0L;
        public String Name = "";
        public gate.FeatureMap fm = gate.Factory.newFeatureMap();
    }

    /**
     * Parses every {@code nameNode} element of the XML response into
     * {@link TopicsBean.Entity} beans (recursing into sub-entity lists) and
     * runs {@link #transform} on the result.
     *
     * @param response XML response element to search
     * @param nameNode tag name of the entity elements to collect
     * @param ud       opaque value forwarded to transform (unused there)
     * @return both the entity list and the derived annotations
     */
    public static Recursive collectInfo(Element response, String nameNode,
            String ud) throws InvalidOffsetException,
            UnsupportedEncodingException {
        List<Annot> annotations = new ArrayList<Annot>();
        Recursive rec = new Recursive();
        NodeList nodeL = response.getElementsByTagName(nameNode);
        if (nodeL.getLength() > 0) {
            List<TopicsBean.Entity> entity_list = new ArrayList<TopicsBean.Entity>(
                    nodeL.getLength());
            for (int i = 0; i < nodeL.getLength(); i++) {
                TopicsBean.Entity ent = new TopicsBean.Entity();
                Node node = nodeL.item(i);
                NodeList data_node = node.getChildNodes();
                for (int j = 0; j < data_node.getLength(); j++) {
                    Node n = data_node.item(j);
                    String name = n.getNodeName();
                    String value = new String(n.getTextContent().getBytes(),
                            "UTF-8"); // NOTE(review): unused local
                    if ("form".equals(name)) {
                        ent.form = new String(n.getTextContent().getBytes(),
                                "UTF-8");
                    } else if ("subentity_list".equals(name)) {
                        try {
                            NodeList subentities = n.getChildNodes();
                            // NOTE(review): casting a NodeList to Element only
                            // works on DOM implementations (e.g. Xerces) where
                            // element nodes implement NodeList themselves;
                            // otherwise this throws ClassCastException —
                            // confirm against the parser actually used.
                            Recursive subannotations = collectInfo(
                                    (Element) subentities, "subentity", ud);
                            Iterator<TopicsBean.Entity> suba_it = subannotations.hack_entity_list
                                    .iterator();
                            while (suba_it.hasNext()) {
                                entity_list.add((TopicsBean.Entity) suba_it
                                        .next());
                            }
                        } catch (InvalidOffsetException e) {
                            System.err.println("Exception in subentities: "
                                    + e.toString());
                        } catch (UnsupportedEncodingException e) {
                            System.err.println("Exception in subentities: "
                                    + e.toString());
                        }
                    } else if ("dictionary".equals(name)) {
                        ent.dictionary = n.getTextContent();
                    } else if ("id".equals(name)) {
                        ent.id = new String(n.getTextContent().getBytes(),
                                "UTF-8");
                    } else if ("relevance".equals(name)) {
                        ent.relevance = new String(n.getTextContent()
                                .getBytes(), "UTF-8");
                    } else if ("sementity".equals(name)) {
                        // One sementity element -> one sementity bean.
                        NodeList typeLi = n.getChildNodes();
                        TopicsBean.Entity.sementity sem = new TopicsBean.Entity.sementity();
                        for (int li_it = 0; li_it < typeLi.getLength(); li_it++) {
                            Node sem_node = typeLi.item(li_it);
                            String name_aux = sem_node.getNodeName();
                            if (name_aux.equals("type"))
                                sem.sementity_type = new String(sem_node
                                        .getTextContent().getBytes(), "UTF-8");
                            else if (name_aux.equals("class"))
                                sem.sementity_class = new String(sem_node
                                        .getTextContent().getBytes(), "UTF-8");
                            else if (name_aux.equals("fiction"))
                                sem.sementity_fiction = new String(sem_node
                                        .getTextContent().getBytes(), "UTF-8");
                            else if (name_aux.equals("confidence"))
                                sem.sementity_confidence = new String(sem_node
                                        .getTextContent().getBytes(), "UTF-8");
                            else if (name_aux.equals("id"))
                                sem.sementity_id = new String(sem_node
                                        .getTextContent().getBytes(), "UTF-8");
                        }
                        ent.sementity_list.add(sem);
                    } else if ("semgeo_list".equals(name)) {
                        // Each semgeo child: elements named after the geo
                        // level (country, city, ...) carrying form/id and
                        // optional std* children.
                        NodeList semgeoLi = n.getChildNodes();
                        for (int semgeo_li = 0; semgeo_li < semgeoLi
                                .getLength(); semgeo_li++) {
                            TopicsBean.Entity.semgeo s = new TopicsBean.Entity.semgeo();
                            Node semge = (Node) semgeoLi.item(semgeo_li);
                            NodeList aux = semge.getChildNodes();
                            for (int aux_it = 0; aux_it < aux.getLength(); aux_it++) {
                                Node aux2 = (Node) aux.item(aux_it);
                                TopicsBean.Entity.semgeo.element e = new TopicsBean.Entity.semgeo.element();
                                NodeList aux5 = aux2.getChildNodes();
                                e.name = new String(aux2.getNodeName()
                                        .getBytes(), "UTF-8");
                                for (int aux6_it = 0; aux6_it < aux5
                                        .getLength(); aux6_it++) {
                                    Node aux7 = (Node) aux5.item(aux6_it);
                                    if (aux7.getNodeName().equals("form"))
                                        e.form = new String(aux7
                                                .getTextContent().getBytes(),
                                                "UTF-8");
                                    else if (aux7.getNodeName().equals("id"))
                                        e.id = new String(aux7.getTextContent()
                                                .getBytes(), "UTF-8");
                                    else if (aux7.getNodeName().contains("std")) {
                                        TopicsBean.Entity.semgeo.element.std std_local = new TopicsBean.Entity.semgeo.element.std();
                                        std_local.name = new String(aux7
                                                .getNodeName().getBytes(),
                                                "UTF-8");
                                        std_local.value = new String(aux7
                                                .getTextContent().getBytes(),
                                                "UTF-8");
                                        e.stdArray.add(std_local);
                                    }
                                }
                                s.semgeo_list.add(e);
                            }
                            ent.semgeo_list.add(s);
                        }
                    } else if ("semld_list".equals(name)) {
                        NodeList semldLi = n.getChildNodes();
                        // NOTE(review): dictionaries/count are leftovers of
                        // commented-out de-duplication logic (removed below);
                        // both are unused.
                        Map<String, String> dictionaries = new HashMap<String, String>();
                        int count = 0;
                        for (int semld_li = 0; semld_li < semldLi.getLength(); semld_li++) {
                            Node aux = semldLi.item(semld_li);
                            String name_aux = aux.getNodeName();
                            if (name_aux.equals("semld")) {
                                // NOTE(review): != compares String references,
                                // not values; it only behaves as intended if
                                // getNodeName() returns the interned literal.
                                // Should almost certainly be
                                // !"#cdata-section".equals(...).
                                if ((aux.hasChildNodes())
                                        && (aux.getFirstChild().getNodeName() != "#cdata-section")) {
                                    // Structured semld: children are
                                    // <dictionaryName>value</dictionaryName>.
                                    NodeList dics = aux.getChildNodes();
                                    for (int dics_it = 0; dics_it < dics
                                            .getLength(); dics_it++) {
                                        TopicsBean.Entity.semld s = new TopicsBean.Entity.semld();
                                        Node dic = dics.item(dics_it);
                                        s.sourceDic = new String(dic
                                                .getNodeName().getBytes(),
                                                "UTF-8");
                                        s.semld = new String(dic
                                                .getTextContent().getBytes(),
                                                "UTF-8");
                                        // (commented-out duplicate-dictionary
                                        // renaming logic removed)
                                        ent.semld_list.add(s);
                                    }
                                } else {
                                    // Plain (or CDATA-only) semld: value with
                                    // no source dictionary.
                                    TopicsBean.Entity.semld s = new TopicsBean.Entity.semld();
                                    s.semld = new String(aux.getTextContent()
                                            .getBytes(), "UTF-8");
                                    s.sourceDic = "";
                                    ent.semld_list.add(s);
                                }
                            }
                        }
                    } else if ("semrefer_list".equals(name)) {
                        NodeList semreferLi = n.getChildNodes();
                        for (int semrefer_li = 0; semrefer_li < semreferLi
                                .getLength(); semrefer_li++) {
                            // TODO:!!!! semrefer entries are not yet parsed.
                        }
                    } else if ("semtheme_list".equals(name)) {
                        NodeList semthemeLi = n.getChildNodes();
                        for (int semtheme_li = 0; semtheme_li < semthemeLi
                                .getLength(); semtheme_li++) {
                            Node semthem = (Node) semthemeLi.item(semtheme_li);
                            NodeList aux = semthem.getChildNodes();
                            TopicsBean.Entity.semtheme s = new TopicsBean.Entity.semtheme();
                            for (int aux_it = 0; aux_it < aux.getLength(); aux_it++) {
                                Node aux2 = (Node) aux.item(aux_it);
                                String aux_name = aux2.getNodeName();
                                if ("id".equals(aux_name)) {
                                    s.id = new String(aux2.getTextContent()
                                            .getBytes(), "UTF-8");
                                } else if ("type".equals(aux_name)) {
                                    s.type = new String(aux2.getTextContent()
                                            .getBytes(), "UTF-8");
                                }
                            }
                            ent.semtheme_list.add(s);
                        }
                    } else if ("standard_list".equals(name)) {
                        NodeList standardLi = n.getChildNodes();
                        for (int std_li = 0; std_li < standardLi.getLength(); std_li++) {
                            Node std = (Node) standardLi.item(std_li);
                            TopicsBean.Entity.standard std_local = new TopicsBean.Entity.standard();
                            // Key is the child element's tag name; value its text.
                            std_local.name = new String(std.getFirstChild()
                                    .getNodeName().getBytes(), "utf-8");
                            std_local.value = new String(std.getTextContent()
                                    .getBytes(), "UTF-8");
                            ent.standard_list.add(std_local);
                        }
                    } else if ("variant_list".equals(name)) {
                        NodeList variantLi = n.getChildNodes();
                        for (int semgeo_li = 0; semgeo_li < variantLi
                                .getLength(); semgeo_li++) {
                            TopicsBean.Entity.variant v = new TopicsBean.Entity.variant();
                            Node var = (Node) variantLi.item(semgeo_li);
                            NodeList aux = var.getChildNodes();
                            for (int aux_it = 0; aux_it < aux.getLength(); aux_it++) {
                                Node aux2 = (Node) aux.item(aux_it);
                                String aux_name = aux2.getNodeName();
                                if ("form".equals(aux_name)) {
                                    v.form = new String(aux2.getTextContent()
                                            .getBytes(), "UTF-8");
                                } else if ("inip".equals(aux_name)) {
                                    v.inip = new String(aux2.getTextContent()
                                            .getBytes(), "UTF-8");
                                } else if ("endp".equals(aux_name)) {
                                    v.endp = new String(aux2.getTextContent()
                                            .getBytes(), "UTF-8");
                                }
                            }
                            ent.variant_list.add(v);
                        }
                    }
                }
                entity_list.add(ent);
            }
            annotations = transform(entity_list, nameNode, ud);
            rec.hack_ann_list = annotations;
            rec.hack_entity_list = entity_list;
        }
        return rec;
    }

    /**
     * Simplified collector for "flat" response elements (money/time/quotation
     * style nodes): reads the scalar children of each {@code nameNode} element
     * and emits one annotation per inip/endp pair found.
     *
     * @param response XML response element to search
     * @param nameNode tag name of the elements to collect; also used as the
     *                 annotation type name
     * @return annotations with the collected features
     */
    public static List<Annot> collectShort(Element response, String nameNode)
            throws InvalidOffsetException, UnsupportedEncodingException {
        List<Annot> annotations = new ArrayList<Annot>();
        NodeList nodeL = response.getElementsByTagName(nameNode);
        for (int i = 0; i < nodeL.getLength(); i++) {
            Long startOffset = 0L, endOffset = 0L;
            Node node = nodeL.item(i);
            NodeList data_node = node.getChildNodes();
            String form = "";
            String normalizedForm = "";
            String actualTime = "";
            String precision = "";
            String amount_form = "";
            String numericValue = "";
            String currency = "";
            String unit = "";
            String who_form = "";
            String who_lemma = "";// ToDo: list
            String verb_lemma = "";// ToDo: list
            String type = "";
            String subject_form = "";
            String subject_lemmas = "";
            String subject_sensesID = "";
            String subject_semantic_lemma = "";
            String verb_form = "";
            String verb_lemmas = "";
            String verb_sensesID = "";
            String verb_semanticLemma = "";
            String complement_form = "";// ToDo: complement_list!!!
            String complement_type = "";
            String degree = "";
            boolean flag_inip = false, flag_endp = false;
            for (int j = 0; j < data_node.getLength(); j++) {
                Node n = data_node.item(j);
                String name = n.getNodeName();
                if ("form".equals(name)) {
                    form = n.getTextContent();
                } else if ("normalized_form".equals(name)) {
                    normalizedForm = n.getTextContent();
                } else if ("atual_time".equals(name)) {
                    // NOTE(review): "atual_time" looks like a typo for
                    // "actual_time"; as written actualTime is only populated
                    // if the API really emits "atual_time" — confirm against
                    // the MeaningCloud response schema.
                    actualTime = n.getTextContent();
                } else if ("precision".equals(name)) {
                    precision = n.getTextContent();
                } else if ("amount_form".equals(name)) {
                    amount_form = n.getTextContent();
                } else if ("numeric_value".equals(name)) {
                    numericValue = n.getTextContent();
                } else if ("currency".equals(name)) {
                    currency = n.getTextContent();
                } else if ("unit".equals(name)) {
                    unit = n.getTextContent();
                } else if ("who".equals(name)) {
                    // Quotation speaker: form + lemma sub-elements.
                    NodeList typeLi = n.getChildNodes();
                    TopicsBean.Quotation.Who who_elem = new TopicsBean.Quotation.Who();
                    for (int li_it = 0; li_it < typeLi.getLength(); li_it++) {
                        Node sem_node = typeLi.item(li_it);
                        String name_aux = sem_node.getNodeName();
                        if (name_aux.equals("form"))
                            who_elem.form = new String(sem_node
                                    .getTextContent().getBytes(), "UTF-8");
                        else if (name_aux.equals("lemma"))
                            who_elem.lemma = new String(sem_node
                                    .getTextContent().getBytes(), "UTF-8");
                    }
                    who_form = who_elem.form;
                    who_lemma = who_elem.lemma;
                } else if ("verb".equals(name)) {
                    // Quotation verb: reuses the Who bean for form + lemma.
                    NodeList typeLi = n.getChildNodes();
                    TopicsBean.Quotation.Who verb_elem = new TopicsBean.Quotation.Who();
                    for (int li_it = 0; li_it < typeLi.getLength(); li_it++) {
                        Node sem_node = typeLi.item(li_it);
                        String name_aux = sem_node.getNodeName();
                        if (name_aux.equals("form"))
                            verb_elem.form = new String(sem_node
                                    .getTextContent().getBytes(), "UTF-8");
                        else if (name_aux.equals("lemma"))
                            verb_elem.lemma = new String(sem_node
                                    .getTextContent().getBytes(), "UTF-8");
                    }
                    verb_form = verb_elem.form;
                    verb_lemma = verb_elem.lemma;
                } else if ("type".equals(name)) {
                    type = n.getTextContent();
                } else if ("inip".equals(name)) {
                    String inip_value = n.getTextContent();
                    if (inip_value != null) {
                        startOffset = Long.parseLong(inip_value, 10);
                        flag_inip = true;
                    } else {
                        System.err.println("error inip");
                    }
                } else if ("endp".equals(name)) {
                    String endp_value = n.getTextContent();
                    if (endp_value != null) {
                        endOffset = Long.parseLong(endp_value, 10);
                        flag_endp = true;
                    } else {
                        System.err.println("error endp");
                    }
                }
                // If the entity was present in the text (both offsets seen and
                // not both zero). NOTE(review): this check runs after EVERY
                // child node, not once per element — correct because the flags
                // are reset after emitting, but an entity genuinely starting
                // and ending at offset 0 would be skipped by the sum check.
                // `&` is a deliberate (non-short-circuit) boolean AND here.
                if (((startOffset + endOffset) != 0) && (flag_inip & flag_endp)) {
                    try {
                        gate.FeatureMap fm = gate.Factory.newFeatureMap();
                        if (!form.isEmpty())
                            fm.put("form", form);
                        if (!normalizedForm.isEmpty())
                            fm.put("normalizedForm", normalizedForm);
                        if (!actualTime.isEmpty())
                            fm.put("actualTime", actualTime);
                        if (!precision.isEmpty())
                            fm.put("precision", precision);
                        if (!amount_form.isEmpty())
                            fm.put("amount_form", amount_form);
                        if (!numericValue.isEmpty())
                            fm.put("numericValue", numericValue);
                        if (!currency.isEmpty())
                            fm.put("currency", currency);
                        if (!unit.isEmpty())
                            fm.put("unit", unit);
                        if (!who_form.isEmpty())
                            fm.put("who_form", who_form);
                        if (!who_lemma.isEmpty())
                            fm.put("who_lemma", who_lemma);
                        if (!verb_form.isEmpty())
                            fm.put("verb_form", verb_form);
                        if (!verb_lemma.isEmpty())
                            fm.put("verb_lemma", verb_lemma);
                        if (!type.isEmpty())
                            fm.put("type", type);
                        if (!subject_form.isEmpty())
                            fm.put("subject_form", subject_form);
                        if (!subject_lemmas.isEmpty())
                            fm.put("subject_lemmas", subject_lemmas);
                        if (!subject_sensesID.isEmpty())
                            fm.put("subject_sensesID", subject_sensesID);
                        if (!subject_semantic_lemma.isEmpty())
                            fm.put("subject_semantic_lemma",
                                    subject_semantic_lemma);
                        // NOTE(review): verb_form is put twice (same value).
                        if (!verb_form.isEmpty())
                            fm.put("verb_form", verb_form);
                        if (!verb_lemmas.isEmpty())
                            fm.put("verb_lemmas", verb_lemmas);
                        if (!verb_sensesID.isEmpty())
                            fm.put("verb_sensesID", verb_sensesID);
                        if (!verb_semanticLemma.isEmpty())
                            fm.put("verb_semanticLemma", verb_semanticLemma);
                        if (!complement_form.isEmpty())
                            fm.put("complement_form", complement_form);
                        if (!complement_type.isEmpty())
                            fm.put("complement_type", complement_type);
                        if (!degree.isEmpty())
                            fm.put("degree", degree);
                        Annot annot = new Annot();
                        annot.startOff = startOffset;
                        // endp is inclusive; +1 makes the end exclusive.
                        annot.endOff = endOffset + 1;
                        annot.Name = nameNode;
                        annot.fm = fm;
                        annotations.add(annot);
                        flag_inip = false;
                        flag_endp = false;
                    } catch (Exception e) {
                        System.err.println(e.toString());
                    }
                }
            }
        }
        return annotations;
    }
}