package edu.byu.cs.roots.opg.io; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.Calendar; import java.util.Date; import java.util.GregorianCalendar; import java.util.HashMap; import java.util.StringTokenizer; import edu.byu.cs.roots.opg.model.EventClass; import edu.byu.cs.roots.opg.model.FamilyClass; import edu.byu.cs.roots.opg.model.IndividualRecord; import edu.byu.cs.roots.opg.util.PlaceAbbr; // 1 BAPL for baptism living followed by 2 date, 2 place (EVENT) public class Parser{ static BufferedReader br; public static HashMap<String, IndividualRecord>individualmap; public static HashMap<String, FamilyClass> familymap; static EventClass nullEvent = new EventClass("","",""); static EventClass nullBapl = new EventClass("-1 ","",""); static HashMap<String, PlaceAbbr> abbrMap; //list of abbreviations static Calendar calendar = new GregorianCalendar(); ;//Calendar object used in parsing of dates public Parser(String filename)throws IOException{ familymap = new HashMap<String, FamilyClass>(); individualmap = new HashMap<String, IndividualRecord>(); setUpAbbreviationMaps(); if (filename == null) throw new IllegalArgumentException ("filename is null"); try { br = new BufferedReader(new FileReader(filename)); } catch (IOException e) { System.out.println("Can't open gedcom file: " + filename); return ; } } /* ===========================================================================================================*/ public static void gedparse(Parser p)throws IOException{ /* ===========================================================================================================*/ IndividualRecord indi; FamilyClass fam; //String line; //int nb=0, tb=0, lb=0, ni = 0; //int nb1=0, tb1=0, lb1=0; // This parser assumes that the gedcom file is organized as follows: // Header material // all individuals // all families // ending material // All gedcom files that I have observed follow this order, although the gedcom standard does not require it while((indi = Parser.nextIndi()) != null){ // Read in all individuals Parser.individualmap.put(indi.id, indi); // System.out.println("Indi:" + indi.givenName + "|" + indi.middleName + "|" + indi.lastName+ "|" + indi.id); //System.out.println(indi.givenName + "|" + indi.lastName+ "|" + indi.bapl.yearInt()+ "|" + indi.death.yearInt()); //ni++; } //System.out.println("nb = " + nb + "; tb = " + tb + "; lb = " + lb + "; ni= " + ni); //System.out.println("nb1 = " + nb1 + "; tb1 = " + tb1 + "; lb1 = " + lb1 + "; ni= " + ni); while((fam = Parser.nextFam()) != null){ // Read in all families if( (((fam.husband == null) || (fam.wife == null)) && fam.children.size() == 0) ) System.out.println("Family with one spouse and no kids: " + fam.id); Parser.familymap.put(fam.id, fam); //if(fam.id.compareTo("F12") == 0)System.out.println("Putting F12"); // System.out.println("Putting " + fam.id); for(int i=0; i < fam.children.size(); i++){ // Link children to parents indi = fam.children.get(i); if(indi == null) System.out.println("null indi " + fam.id); else{ indi.father = fam.husband; //if(indi.father != null)indi.father.nChildren++; indi.mother = fam.wife; //if(indi.mother != null)indi.mother.nChildren++; } } //}/ } // end while(fam ... //System.out.println("Out of nextFam"); p.close(); } // end gedParse /* ===========================================================================================================*/ static IndividualRecord nextIndi() throws IOException{ // Returns the next individual in the gedcom file. Returns null if there are no more indis. /* ===========================================================================================================*/ String line, type, code; br.mark(2000); while((line = br.readLine()) != null) { StringTokenizer t = new StringTokenizer(line, " @"); code = t.nextToken(); if(code.compareTo("0") == 0){ String tmpid = t.nextToken(); if(t.countTokens() > 0){ type = t.nextToken(); if(type.compareTo("INDI")==0) return parseIndi(tmpid); if(type.compareTo("FAM")==0){ br.reset(); return null; // no more indis in this file } } } // end "0" } // end while return null; } /* ===========================================================================================================*/ static IndividualRecord parseIndi(String tmpid) throws IOException{ /* ===========================================================================================================*/ IndividualRecord indi = new IndividualRecord(tmpid); String line, code, type; br.mark(2000); while((line = br.readLine()) != null) { StringTokenizer t = new StringTokenizer(line, " @"); code = t.nextToken(); if(code.compareTo("0") == 0){ br.reset(); return indi; } // end "0" if(code.compareTo("1") == 0){ // System.out.println(line); if(t.hasMoreTokens()){ type = t.nextToken(); if(type.compareTo("NAME")==0){ if(t.hasMoreTokens()){ indi.givenName = t.nextToken(); if(indi.givenName.charAt(0) == '/')indi.givenName = ""; } indi.middleName = getMiddleNames(line); t = new StringTokenizer(line, "/"); if(t.hasMoreTokens()){ t.nextToken(); indi.surName = ""; } if(t.hasMoreTokens()) indi.surName = t.nextToken(); br.mark(2000); if((line = br.readLine()) == null) return null; if(line.substring(2,6).compareTo("NSFX")==0) indi.surName += line.substring(6); else br.reset(); } // end NAME else if(type.compareTo("BAPL")==0){ //br.mark(3000); //indi.bapl = nextEvent(); } else if(type.compareTo("BIRT")==0){ br.mark(2000); indi.birth = nextEvent(); } else if(type.compareTo("DEAT")==0){ br.mark(2000); indi.death = nextEvent(); } // "DEAT" else if(type.compareTo("FAMC")==0) { //TODO check for Primary/No multiple family assignments (ie use null or something) indi.famc = t.nextToken(); } else if(type.compareTo("FAMS")==0) indi.fams.add(t.nextToken()); else if (type.compareTo("SEX")==0) { String sex = t.nextToken(); if (sex.equalsIgnoreCase("M")) indi.gender = 0; else indi.gender = 1; } else if (type.equals("OBJE")) { line = br.readLine(); t = new StringTokenizer(line); t.nextToken(); if (t.nextToken().equals("FORM")) { String fileType = t.nextToken(); if (fileType.equalsIgnoreCase("jpg")) { line = br.readLine(); t = new StringTokenizer(line); t.nextToken(); if (t.nextToken().equals("FILE")) { indi.photoPath = line.substring(7); indi.hasPhoto = true; } } } } } // end if t.hasMoreTokens } // end "1" br.mark(2000); } // end while System.out.println("returning null individual: " + line); return null; } /* ===========================================================================================================*/ static FamilyClass nextFam() throws IOException{ /* ===========================================================================================================*/ FamilyClass fam = null; String line, code, type; StringTokenizer t; br.mark(2000); while((line = br.readLine()) != null) { t = new StringTokenizer(line, " @"); if (!t.hasMoreTokens()) continue; code = t.nextToken(); if(code.compareTo("0") == 0){ String tmpid = t.nextToken(); if(t.countTokens() > 0){ type = t.nextToken(); if(type.compareTo("FAM")==0){ fam = new FamilyClass(tmpid); br.mark(2000); while((line = br.readLine()) != null) { t = new StringTokenizer(line, " @"); code = t.nextToken(); if(code.compareTo("0") == 0){ br.reset(); return fam; } else if(code.compareTo("1") == 0){ type = t.nextToken(); if(type.compareTo("CHIL")==0){ String cid = t.nextToken(); IndividualRecord child = individualmap.get(cid); fam.children.add(child); } if(type.compareTo("MARR")==0){ br.mark(2000); fam.marriage = nextEvent(); if (fam.husband != null) { fam.husband.marriage = fam.marriage; } else if (fam.wife != null) { fam.wife.marriage = fam.marriage; } else { //System.out.print("no spouses"); } } if(type.compareTo("HUSB")==0){ String id = t.nextToken(); fam.husband = individualmap.get(id); } if(type.compareTo("WIFE")==0) fam.wife = individualmap.get(t.nextToken()); } // "1" //else //br.reset(); } // end while } // end if FAM } // end if countTokens > 0 } // if "0" } // end while return null; } // end nextFam() /***********************************************************************************/ static EventClass nextEvent()throws IOException{ /***********************************************************************************/ EventClass e = new EventClass("","",""); String line, code; while((line = br.readLine()) != null) { StringTokenizer t = new StringTokenizer(line, " @"); code = t.nextToken(); if(code.compareTo("0") == 0 || code.compareTo("1") == 0){ br.reset(); return e; } if(code.compareTo("2") == 0) { if(line.length() < 6)System.out.println("Parser error: Expecting a line of at least six characters: " + line); if(line.substring(2,6).compareTo("DATE")==0) { e.date = line.substring(7); e.sortDate = parseDate(e.date); //br.mark(200); } else if(line.substring(2,6).compareTo("PLAC")==0) { if(line.length() > 6) { StringTokenizer place = new StringTokenizer(line.substring(7), ",()"); if(place.hasMoreTokens()) e.place1 = place.nextToken(); if ( e.place1.trim().equalsIgnoreCase("of") || (e.place1.trim().equalsIgnoreCase(".")) || (e.place1.trim().equalsIgnoreCase("<of")) || (e.place1.trim().equalsIgnoreCase("<")) ) e.place1 = place.nextToken(); while(place.hasMoreTokens()) { // Make the place parse correctly might need to be implemented String tempPlacePart = place.nextToken().trim(); if (!tempPlacePart.equals("USA") ) e.place2 = tempPlacePart; } e.place2 = Parser.abbreviatePlace(e.place2); } } /*else if(line.substring(2,6).compareTo("TEMP")==0){ if(line.length() > 7) e.temple = line.substring(7,line.length()); else e.temple = "Temple";*/ } // br.reset(); // end if "2" } // end while return e; } // end nextEvent /* ===========================================================================================================*/ static String getMiddleNames(String line){ /* ===========================================================================================================*/ int len = line.length(); if(len < 8) return ""; boolean lastCharacterWasASpace = true; int numberOfNames = 0; int middleNameStart = 0, middleNameEnd = 0; for(int i=7; i<len; i++){ if (line.charAt(i) == '/') { if (lastCharacterWasASpace) { lastCharacterWasASpace = false; numberOfNames++; if(numberOfNames == 2) middleNameStart = i; } break; } if(line.charAt(i) == ' '){ lastCharacterWasASpace = true; } else if (lastCharacterWasASpace){ lastCharacterWasASpace = false; numberOfNames++; if(numberOfNames == 2) middleNameStart = i; } } if(middleNameStart < 7) return ""; middleNameEnd = line.indexOf("/"); if(middleNameEnd < 7) return ""; return (line.substring(middleNameStart, middleNameEnd)).trim(); } /* ===========================================================================================================*/ void close()throws IOException{ /* ===========================================================================================================*/ br.close(); } /* ===========================================================================================================*/ void reOpen(String filename)throws IOException{ /* ===========================================================================================================*/ br.close(); try { br = new BufferedReader(new FileReader(filename)); } catch (IOException e) { System.out.println("Can't open gedcom file: " + filename); return ; } } /* ===========================================================================================================*/ static String abbreviatePlace(String str) { PlaceAbbr abbr = abbrMap.get(str.toLowerCase()); if (abbr != null && abbr.known < 4) { ++abbr.frequency; return abbr.abbr; } else return str; } /* ===========================================================================================================*/ static void setUpAbbreviationMaps() { //this method sets up all of the abbreviations for states, countries, etc. //all abbreviations are listed in lower case but they can map to any case abbrMap = new HashMap<String, PlaceAbbr>(); //US State and Territory abbreviations abbrMap.put("alabama", new PlaceAbbr("AL",2,2)); abbrMap.put("alaska", new PlaceAbbr("AK",2,2)); abbrMap.put("american samoa", new PlaceAbbr("AS",1,7)); abbrMap.put("arizona", new PlaceAbbr("AZ",2,1)); abbrMap.put("arkansas", new PlaceAbbr("AR",2,2)); abbrMap.put("california", new PlaceAbbr("CA",2,1)); abbrMap.put("colorado", new PlaceAbbr("CO",2,2)); PlaceAbbr ct = new PlaceAbbr("CT",2,2); abbrMap.put("connecticut", ct); abbrMap.put("conn", ct); abbrMap.put("conn.", ct); abbrMap.put("delaware", new PlaceAbbr("DE",2,3)); abbrMap.put("district of columbia", new PlaceAbbr("DC",2,1)); abbrMap.put("federated states of micronesia", new PlaceAbbr("FM",1,7)); abbrMap.put("florida", new PlaceAbbr("FL",2,1)); abbrMap.put("georgia", new PlaceAbbr("GA",2,2)); abbrMap.put("guam", new PlaceAbbr("GU",1,9)); abbrMap.put("hawaii", new PlaceAbbr("HI",2,2)); abbrMap.put("idaho", new PlaceAbbr("ID",2,2)); abbrMap.put("illinois", new PlaceAbbr("IL",2,2)); abbrMap.put("indiana", new PlaceAbbr("IN",2,2)); abbrMap.put("iowa", new PlaceAbbr("IA",2,2)); abbrMap.put("kansas", new PlaceAbbr("KS",2,2)); abbrMap.put("kentucky", new PlaceAbbr("KY",2,2)); abbrMap.put("louisiana", new PlaceAbbr("LA",2,2)); abbrMap.put("maine", new PlaceAbbr("ME",2,3)); abbrMap.put("marshall islands", new PlaceAbbr("MH",1,7)); abbrMap.put("maryland", new PlaceAbbr("MD",2,2)); PlaceAbbr ma = new PlaceAbbr("MA",2,2); abbrMap.put("massachusetts", ma); abbrMap.put("mass.", ma); abbrMap.put("michigan", new PlaceAbbr("MI",2,2)); abbrMap.put("minnesota", new PlaceAbbr("MN",2,2)); abbrMap.put("mississippi", new PlaceAbbr("MS",2,3)); abbrMap.put("missouri", new PlaceAbbr("MO",2,3)); abbrMap.put("montana", new PlaceAbbr("MT",2,2)); abbrMap.put("nebraska", new PlaceAbbr("NE",2,2)); abbrMap.put("nevada", new PlaceAbbr("NV",2,2)); abbrMap.put("new hampshire", new PlaceAbbr("NH",2,2)); abbrMap.put("new jersey", new PlaceAbbr("NJ",2,2)); abbrMap.put("new mexico", new PlaceAbbr("NM",2,2)); abbrMap.put("new york", new PlaceAbbr("NY",2,1)); abbrMap.put("north carolina", new PlaceAbbr("NC",2,2)); abbrMap.put("north dakota", new PlaceAbbr("ND",2,2)); abbrMap.put("northern mariana islands", new PlaceAbbr("MA",1,9)); abbrMap.put("ohio", new PlaceAbbr("OH",2,2)); abbrMap.put("oklahoma", new PlaceAbbr("OK",2,2)); abbrMap.put("oregon", new PlaceAbbr("OR",2,2)); abbrMap.put("palau", new PlaceAbbr("PW",1,9)); abbrMap.put("pennsylvania", new PlaceAbbr("PA",2,2)); abbrMap.put("puerto rico", new PlaceAbbr("PR",1,5)); abbrMap.put("rhode island", new PlaceAbbr("RI",2,2)); abbrMap.put("south carolina", new PlaceAbbr("SC",2,2)); abbrMap.put("south dakota", new PlaceAbbr("SD",2,2)); abbrMap.put("tennessee", new PlaceAbbr("TN",2,2)); abbrMap.put("texas", new PlaceAbbr("TX",2,1)); abbrMap.put("utah", new PlaceAbbr("UT",2,2)); abbrMap.put("vermont", new PlaceAbbr("VT",2,2)); abbrMap.put("virgin islands", new PlaceAbbr("VI",1,5)); abbrMap.put("virginia", new PlaceAbbr("VA",2,2)); abbrMap.put("washington", new PlaceAbbr("WA",2,2)); abbrMap.put("west virginia", new PlaceAbbr("WV",2,2)); abbrMap.put("wisconsin", new PlaceAbbr("WI",2,2)); abbrMap.put("wyoming", new PlaceAbbr("WY",2,2)); //country abbreviations //North America PlaceAbbr usa = new PlaceAbbr("USA",1,0); abbrMap.put("united states", usa); abbrMap.put("united states of america", usa); abbrMap.put("us", usa); abbrMap.put("usa", usa); abbrMap.put("canada", new PlaceAbbr("CAN",1,1)); PlaceAbbr mexico = new PlaceAbbr("MEX", 1,1); abbrMap.put("mexico", mexico); abbrMap.put("m�xico", mexico); //Europe abbrMap.put("albania", new PlaceAbbr("ALB",1,5)); abbrMap.put("andorra", new PlaceAbbr("AND",1,8)); abbrMap.put("austria", new PlaceAbbr("AUT",1,7)); abbrMap.put("oesterreich", new PlaceAbbr("AUT",1,7)); abbrMap.put("belarus", new PlaceAbbr("BLR",2,7)); abbrMap.put("belgium", new PlaceAbbr("BEL",2,5)); PlaceAbbr bosnia = new PlaceAbbr("BIH",2,9); abbrMap.put("Bosnia", bosnia); abbrMap.put("bosnia and herzegovinia", bosnia); abbrMap.put("bosnia hercegovina", bosnia); abbrMap.put("bulgaria", new PlaceAbbr("BGR",2,7)); //croatia?? abbrMap.put("cyprus", new PlaceAbbr("CYP",2,7)); abbrMap.put("denmark", new PlaceAbbr("DNK",2,6)); PlaceAbbr england = new PlaceAbbr("ENG",1,2); abbrMap.put("england", england); abbrMap.put("engl", england); abbrMap.put("engl.", england); abbrMap.put("eng", england); abbrMap.put("eng.", england); abbrMap.put("france", new PlaceAbbr("FRA",2,4)); abbrMap.put("germany", new PlaceAbbr("GER",2,4));//ISO code is "DEU" abbrMap.put("holland", new PlaceAbbr("HOL",1,4)); abbrMap.put("italy", new PlaceAbbr("ITA",2,5)); abbrMap.put("netherlands", new PlaceAbbr("NLD",1,6)); abbrMap.put("norway", new PlaceAbbr("NOR",2,5)); abbrMap.put("russia", new PlaceAbbr("RUS",1,5)); PlaceAbbr ussr = new PlaceAbbr("USSR",1,1); abbrMap.put("soviet union", ussr); abbrMap.put("union of soviet socialist republics", ussr); abbrMap.put("spain", new PlaceAbbr("ESP",2,8)); abbrMap.put("sweden", new PlaceAbbr("SWE",2,5)); abbrMap.put("switzerland", new PlaceAbbr("CHE",1,9)); abbrMap.put("united kingdom",new PlaceAbbr("UK",1,1)); //abbrMap.put("", new PlaceAbbr("", 0,0)); //Austrailia and Pacific Islands (Oceania) abbrMap.put("austraila", new PlaceAbbr("AUS", 1,2)); //Asia //South America //Africa //continents abbrMap.put("north america", new PlaceAbbr("N.America",0,0)); abbrMap.put("south america", new PlaceAbbr("S.America",0,0)); abbrMap.put("europe", new PlaceAbbr("EUR",0,0)); abbrMap.put("asia", new PlaceAbbr("Asia",0,0)); abbrMap.put("africa", new PlaceAbbr("Africa",0,0)); abbrMap.put("oceania", new PlaceAbbr("Oceania",0,10)); } /* ===========================================================================================================*/ /* * This method takes a string and attempts to create a Date from it. * It returns null if not possible */ public static Date parseDate(String str) { Integer year, month, day; boolean AD = true; year = month = day = null; StringTokenizer t = new StringTokenizer(str, " \t\n\r\f,/-<>()"); while (t.hasMoreTokens()) { String temp = t.nextToken().toLowerCase(); try { int num = Integer.parseInt(temp); if (year == null) { if (day != null) year = num; else if (num > 31) year = num; else day = num; } else if (day == null) { if (num <= 31 && num > 0) day = num; } } catch (NumberFormatException e) { //assume token is a string //see if it's a month (must be 3 characters or longer) if (temp.length() > 2 && month == null) { String monthAbbr = temp.substring(0,3); for (int i = 0; i < months.length; ++i) { if (monthAbbr.equals(months[i].toLowerCase()) ) { month = i; break; } } } //check to see if it's BC or B.C. if (temp.equals("bc") || temp.equals("b.c.")) AD = false; //if it's not a month or era indicator, then ignore it } } if (year == null && day != null) year = day; //if year is missing, return null if (year == null) return null; //if a field is missing, then substitute the possible midpoint for lower fields // ie. if month is missing, set month & day corresponding to midpoint of year calendar.set(Calendar.YEAR, year); if (!AD && year > 0) calendar.set(Calendar.ERA, GregorianCalendar.BC); else calendar.set(Calendar.ERA, GregorianCalendar.AD); if (month == null) { calendar.set(Calendar.DAY_OF_YEAR, calendar.getActualMaximum(Calendar.DAY_OF_YEAR)/2); } else if (day == null) { calendar.set(Calendar.MONTH, month); calendar.set(Calendar.DAY_OF_MONTH, calendar.getActualMaximum(Calendar.DAY_OF_MONTH)/2); } else { calendar.set(Calendar.MONTH, month); calendar.set(Calendar.DAY_OF_MONTH, day); } return calendar.getTime(); } /* ===========================================================================================================*/ public static final String[] months = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; /* ===========================================================================================================*/ }