package org.basex.build.file;

import static org.basex.build.file.MAB2.*;
import static org.basex.util.Token.*;

import java.io.IOException;
import java.util.Arrays;

import org.basex.build.BuildException;
import org.basex.build.SingleParser;
import org.basex.core.Prop;
import org.basex.io.IO;
import org.basex.io.IOFile;
import org.basex.io.out.PrintOutput;
import org.basex.io.random.DataAccess;
import org.basex.util.Performance;
import org.basex.util.TokenBuilder;
import org.basex.util.Util;
import org.basex.util.hash.TokenMap;
import org.basex.util.hash.TokenObjMap;
import org.basex.util.list.ByteList;

/**
 * This class parses files in the MAB2 format
 * and sends events to the specified database builder.
 *
 * <p>Parsing runs in two passes over the input: the first pass collects the
 * file offsets of all entries and groups child entries below their parents,
 * the second pass re-reads every entry from its offset and emits it.</p>
 *
 * @author BaseX Team 2005-12, BSD License
 * @author Christian Gruen
 */
public final class MAB2Parser extends SingleParser {
  /** Encoding of MAB2 input. */
  private static final String ENCODING = "iso-8859-1";
  /** Temporary buffer, shared by all calls of {@link #find}. */
  private final ByteList buffer = new ByteList();
  /** Subject assignments. */
  private final TokenMap subjects = new TokenMap();
  /** Media type assignments. */
  private final TokenMap mediatypes = new TokenMap();
  /** Language assignments. */
  private final TokenMap languages = new TokenMap();
  /** MedioVis ID assignments. */
  private final TokenMap mvids = new TokenMap();
  /** Optional lending numbers. */
  private final TokenMap lendings = new TokenMap();
  /** Optional lending status. */
  private final TokenMap status = new TokenMap();
  /** Image assignments. */
  private final TokenMap posters = new TokenMap();
  /** Genre assignments. */
  private final TokenMap genres = new TokenMap();
  /** Temporary build data: signatures of the current entry. */
  private final byte[][] sig = new byte[500][];
  /** Temporary build data: authors of the current entry. */
  private final byte[][] auth = new byte[50][];
  /** Temporary build data: institutes of the current entry. */
  private final byte[][] inst = new byte[50][];
  /** Flat database creation. */
  private final boolean flat;
  /** Input to be parsed. */
  private final DataAccess input;
  /** Temporary read position. */
  private long off;
  /** Maximum id. */
  private int maxid;

  /**
   * Constructor.
   * @param source source data
   * @param target database target
   * @param prop database properties
   * @throws IOException I/O exception
   */
  public MAB2Parser(final IO source, final String target, final Prop prop)
      throws IOException {

    super(source, target);
    // set parser properties
    final ParserProp props = new ParserProp(prop.get(Prop.PARSEROPT));
    flat = props.is(ParserProp.FLAT);
    input = new DataAccess(new IOFile(source.path()));
  }

  @Override
  public void parse() throws IOException {
    // read in indexes
    index(mediatypes, "mediatypes");
    index(subjects, "subjects");
    index(languages, "lang");
    index(mvids, "mvids");
    index(lendings, "lendings");
    index(status, "status");
    index(posters, "posters");
    index(genres, "genres");

    // find maximum id
    for(int i = 1; i <= mvids.size(); ++i) {
      final int id = toInt(mvids.value(i));
      if(maxid < id) maxid = id;
    }

    // check beginning of input file
    if(input.read1() != '#' || input.read1() != '#' || input.read1() != '#') {
      throw new BuildException("Invalid MAB2 input (doesn't start with ###)");
    }

    builder.startElem(LIBRARY, atts.reset());

    // find file offsets of all titles
    final Performance p = new Performance();

    // MAB2 entries
    final TokenObjMap<MAB2Entry> ids = new TokenObjMap<MAB2Entry>();

    int i = 0;
    for(byte[] id; (id = id(input)) != null;) {
      final long pos = off;
      final byte[] par = par(input);

      // entries with a parent id are registered as children of that parent
      final boolean child = par != null;
      final byte[] key = child ? par : id;
      MAB2Entry entry = ids.get(key);
      if(entry == null) {
        entry = new MAB2Entry();
        ids.add(key, entry);
      }
      if(child) entry.add(pos);
      else entry.pos(pos);

      if(Util.debug) {
        if((++i & 0x7FFF) == 0) Util.err(" " + i + '\n');
        else if((i & 0xFFF) == 0) Util.err("!");
        else if((i & 0x3FF) == 0) Util.err(".");
      }
    }

    if(Util.debug) Util.err("\nParse Offsets (%): %/%\n", ids.size(), p,
        Performance.getMem());

    // create all titles
    for(i = 1; i <= ids.size(); ++i) {
      final MAB2Entry entry = ids.value(i);
      final long pos = entry.pos;
      // check if top entry exists...
      final byte[] l = pos != 0 ? addEntry(input, pos, entry.size, null) : null;
      // loop through all children...
      for(int j = 0; j < entry.size; ++j) {
        addEntry(input, entry.children[j], 0, l);
      }
      // close the parent element that addEntry() left open for its children
      if(entry.size != 0 && pos != 0 && !flat) builder.endElem();
    }

    if(Util.debug) Util.err("\nCreate Titles: %/%\n", p, Performance.getMem());
    builder.endElem();

    // write the mediovis ids back to disk
    final PrintOutput out = new PrintOutput("mvids.dat");
    try {
      for(i = 1; i <= mvids.size(); ++i) {
        out.print(mvids.key(i));
        out.write('\t');
        out.println(mvids.value(i));
      }
    } finally {
      // release the file handle even if writing fails
      out.close();
    }
  }

  @Override
  public void close() throws IOException {
    input.close();
  }

  /**
   * Returns the next id, i.e., the value of the next line starting with
   * {@code 001}. The offset of that line is stored in {@link #off}.
   * @param in input stream
   * @return id, or {@code null} if the end of the input was reached
   */
  private byte[] id(final DataAccess in) {
    while(in.more()) {
      if(in.read1() != '\n') continue;
      final int n = in.read1();
      if(n == '0' && in.read1() == '0' && in.read1() == '1') {
        off = in.cursor() - 3;
        return ident(in);
      }
    }
    return null;
  }

  /**
   * Returns the next parent id, i.e., the value of the next line starting
   * with {@code 010} or {@code 453}. The search stops at the first line
   * that starts with {@code #} or is empty.
   * @param in input stream
   * @return parent id, or {@code null} if none was found
   */
  private static byte[] par(final DataAccess in) {
    while(in.more()) {
      if(in.read1() != '\n') continue;
      final int b1 = in.read1();
      if(b1 == '#' || b1 == '\n') return null;
      if((b1 == '0' && in.read1() == '1' && in.read1() == '0') ||
         (b1 == '4' && in.read1() == '5' && in.read1() == '3')) return ident(in);
    }
    return null;
  }

  /**
   * Returns the next text: one byte is skipped, and all following bytes are
   * collected until a byte smaller than a space character (or the end of the
   * input) is found.
   * @param in input stream
   * @return next text
   */
  private static byte[] ident(final DataAccess in) {
    // skip the single byte that follows the three-digit field number
    in.read1();
    // a local, growing buffer is used: ids are not limited in length, and
    // no mutable state is shared between parser instances
    final ByteList id = new ByteList();
    while(in.more()) {
      final byte b = in.read1();
      if(b < ' ') break;
      id.add(b);
    }
    return id.toArray();
  }

  /**
   * Gets all characters up to the specified character. Control characters
   * (non-negative bytes smaller than a space) are skipped.
   * @param in input stream
   * @param delim delimiter
   * @return byte array; if the end of the input is reached before the
   *   delimiter, the collected bytes are returned, or {@code null} if
   *   nothing was collected
   */
  private byte[] find(final DataAccess in, final byte delim) {
    buffer.reset();
    while(in.more()) {
      final byte c = in.read1();
      if(c == delim) return buffer.toArray();
      if(c < 0 || c >= ' ') buffer.add(c);
    }
    // end of input: hand back an unterminated last line instead of dropping it
    final byte[] rest = buffer.toArray();
    return rest.length != 0 ? rest : null;
  }

  /**
   * Adds an entry: reads all lines of the entry at the specified offset and
   * sends the resulting element to the builder.
   * @param in input stream
   * @param pos file offset to start from
   * @param sb number of subordinate titles
   * @param last last title
   * @return last title
   * @throws IOException I/O exception
   */
  private byte[] addEntry(final DataAccess in, final long pos, final int sb,
      final byte[] last) throws IOException {

    /* Temporary build data. */
    byte[] mvID = null;
    byte[] bibID = null;
    byte[] title = null;
    byte[] description = null;
    byte[] type = null;
    byte[] language = null;
    byte[] original = null;
    byte[] subtitle = null;
    byte[] town = null;
    byte[] publisher = null;
    byte[] year = null;
    byte[] format = null;
    byte[] details = null;
    byte[] note = null;
    byte[] isbn = null;
    byte[] subject = null;
    int nrSigs = 0;
    int nrAuth = 0;
    int nrInst = 0;
    boolean shortTitle = false;

    // position disk cursor
    in.cursor(pos);

    // collect meta-data
    while(true) {
      final byte[] line = find(in, (byte) '\n');
      // a missing line (end of input) terminates the entry like a short line
      final int l = line == null ? 0 : line.length;

      if(l > 3) {
        if(line[0] == '#') continue;
        // the first three bytes contain the field number
        final int n = toInt(line, 0, 3);
        if(n == 1) {
          if(bibID == null) {
            bibID = string(line);
            mvID = mvids.get(bibID);
            // assign a new id to yet unknown entries
            if(mvID == null) {
              mvID = token(++maxid);
              mvids.add(bibID, mvID);
            }
          }
        } else if(n == 29) {
          type = mediatypes.get(num(line));
        } else if(n == 37 && language == null) {
          language = language(line);
        } else if(n == 81) {
          title = string(line);
          shortTitle = true;
        } else if(n >= 100 && n < 200 && (n & 3) == 0) {
          // surplus authors are ignored instead of overflowing the array
          if(nrAuth < auth.length) auth[nrAuth++] = string(line);
        } else if(n >= 200 && n < 300 && (n & 3) == 0) {
          // surplus institutes are ignored instead of overflowing the array
          if(nrInst < inst.length) inst[nrInst++] = string(line);
        } else if(n == 304) {
          original = string(line);
        } else if(n == 310) {
          title = string(line);
          shortTitle = true;
        } else if(n == 331) {
          if(title == null) title = string(line);
          else if(shortTitle) description = string(line);
        } else if(n == 335) {
          subtitle = string(line);
        } else if(n == 340) {
          if(original == null) original = string(line);
        } else if(n == 359) {
          description = merge(description, string(line));
        } else if(n == 410) {
          town = string(line);
        } else if(n == 412) {
          publisher = string(line);
        } else if(n == 425) {
          year = year(line);
        } else if(n == 433) {
          format = string(line);
        } else if(n == 501) {
          details = string(line);
          year = year2(details, year);
        } else if(n == 537) {
          note = string(line);
        } else if(n == 540 || n == 542) {
          isbn = string(line);
        } else if(n == 544) {
          // surplus signatures are ignored instead of overflowing the array
          if(nrSigs < sig.length) sig[nrSigs++] = string(line);
        } else if(n == 700) {
          if(nrSigs == 0) sig[nrSigs++] = string(line);
        }
      } else {
        // end of entry: send the collected data to the builder
        atts.reset();
        atts.add(MV_ID, mvID);
        atts.add(BIB_ID, bibID);
        if(sb != 0 && !flat) atts.add(MAX, token(sb));

        // merge super and sub titles
        if(last != null) {
          if(title == null) title = last;
          else if(!eq(last, title)) title = concat(last, SEMI, title);
        }

        // add line below to omit root nodes
        builder.startElem(MEDIUM, atts);
        add(TYPE, type);
        add(LANGUAGE, language);
        for(int s = 0; s < nrAuth; ++s) add(AUTHOR, auth[s]);
        for(int s = 0; s < nrInst; ++s) add(INSTITUTE, inst[s]);
        add(ORIGINAL, original);
        add(TITLE, title);
        add(SUBTITLE, subtitle);
        add(DESCRIPTION, description);
        add(TOWN, town);
        add(PUBLISHER, publisher);
        add(YEAR, year);
        add(FORMAT, format);
        add(DETAILS, details);
        add(NOTE, note);
        for(int s = 0; s < nrSigs; ++s) add(SIGNATURE, sig[s]);
        // actually: several subjects/lending numbers per medium..
        // the first signature that yields a known subject wins
        for(int s = 0; s < nrSigs; ++s) {
          if(subject == null) subject = subjects.get(subject(sig[s]));
        }
        add(SUBJECT, subject);
        add(ISBN, isbn);
        add(POSTER, posters.get(bibID));
        add(GENRE, genres.get(mvID));
        add(STATUS, status.get(bibID));
        add(LENDINGS, lendings.get(bibID));
        // entries with subordinate titles stay open; parse() closes them
        if(sb == 0 || flat) builder.endElem();
        return title;
      }
    }
  }

  /**
   * Adds a tag and a content node. Nothing is added if no content exists.
   * @param tag tag to be added
   * @param cont content to be added (may be {@code null})
   * @throws IOException I/O exception
   */
  private void add(final byte[] tag, final byte[] cont) throws IOException {
    if(cont == null) return;
    builder.startElem(tag, atts.reset());
    builder.text(utf8(cont, ENCODING));
    builder.endElem();
  }

  /**
   * Parses and returns a year: collects the first four digits that occur
   * after the first four bytes of the line.
   * @param line line to be parsed
   * @return up to four digits, or {@code null} if no digit was found
   */
  private static byte[] year(final byte[] line) {
    final byte[] n = new byte[4];
    final int l = line.length;
    int c = 0;
    for(int i = 4; i < l; ++i) {
      final byte b = line[i];
      if(b >= '0' && b <= '9') {
        n[c++] = b;
        if(c == 4) return n;
      }
    }
    return c != 0 ? Arrays.copyOf(n, c) : null;
  }

  /**
   * Looks up and returns four digits in the specified line.
   * If no digits are found, the specified year is returned.
   * The specified year is also kept if it lies between 1400 and 1950, or if
   * the digits found do not lie between 1500 and 2050.
   * @param line line to be parsed
   * @param yr year
   * @return year
   */
  private static byte[] year2(final byte[] line, final byte[] yr) {
    final int l = line.length;
    // i: current position; j: position of the last non-digit
    int i = -1;
    int j = -1;
    while(++i != l) {
      final byte b = line[i];
      if(b < '0' || b > '9') {
        // exactly four digits between two non-digits: stop
        if(i - 5 == j) break;
        j = i;
      }
    }
    // no run of exactly four digits found
    if(i - 5 != j) return yr;

    int oy = yr != null ? toInt(yr) : 0;
    if(oy >= 1400 && oy <= 1950) return yr;

    final byte[] y = Arrays.copyOfRange(line, j + 1, j + 5);
    oy = toInt(y);
    return oy >= 1500 && oy <= 2050 ? y : yr;
  }

  /**
   * Parses and returns a subject: skips all leading bytes smaller than
   * {@code 'a'} and returns the next three bytes that are not smaller
   * than {@code 'a'}.
   * @param line line to be parsed
   * @return three bytes, or {@code null} if less than three bytes were found
   */
  private static byte[] subject(final byte[] line) {
    final byte[] n = new byte[3];
    int i = -1;
    final int l = line.length;
    while(++i != l && line[i] < 'a');
    int c = 0;
    while(i != l && line[i] >= 'a') {
      n[c++] = line[i++];
      if(c == 3) return n;
    }
    return null;
  }

  /**
   * Corrects special characters in the language attribute and replaces all
   * language codes, which are separated by {@code +}, {@code ?} or {@code $},
   * with their assignments.
   * @param token token to be corrected
   * @return corrected characters
   */
  private byte[] language(final byte[] token) {
    final byte[] t = string(token);
    for(int i = 0; i < t.length; ++i) if(t[i] == '?' || t[i] == '$') t[i] = '+';

    final TokenBuilder tb = new TokenBuilder();
    for(final byte[] lang : split(t, '+')) {
      final byte[] l = languages.get(lang);
      if(tb.size() != 0) tb.add('+');
      // unknown codes are adopted as they are; falling back to the complete
      // token would repeat all codes for every unknown one
      tb.add(l != null ? l : lang);
    }
    return tb.finish();
  }

  /**
   * Returns the contents of the specified line without its first four bytes.
   * Some characters are replaced, a space at the first position is dropped,
   * and consecutive spaces are merged.
   * @param line line to be modified
   * @return modified byte array
   */
  private static byte[] string(final byte[] line) {
    final byte[] tmp = new byte[line.length - 4];
    int c = 0;
    final int l = line.length;
    boolean space = false;
    for(int s = 4; s < l; ++s) {
      byte b = line[s];
      // double cross
      if(b == -121) b = '+';
      // delimiter
      else if(b == -84) b = ' ';
      else if(b == '<') b = '[';
      else if(b == '>') b = ']';
      if(b == ' ' && (space || s == 4)) continue;
      space = b == ' ';
      tmp[c++] = b;
    }
    return c == tmp.length ? tmp : Arrays.copyOf(tmp, c);
  }

  /**
   * Returns the contents of the specified line without its first four bytes
   * and without leading zeroes.
   * @param line line to be modified
   * @return modified byte array
   */
  private static byte[] num(final byte[] line) {
    final int l = line.length;
    int s = 3;
    while(++s < l && line[s] == '0');
    return Arrays.copyOfRange(line, s, line.length);
  }

  /**
   * Merges two byte arrays, separated by a dot and a space.
   * @param text1 first text (may be {@code null})
   * @param text2 second text
   * @return second text if the first text is {@code null}, merged texts
   *   otherwise
   */
  private static byte[] merge(final byte[] text1, final byte[] text2) {
    return text1 == null ? text2 : concat(text1, token(". "), text2);
  }

  /**
   * Fills the specified hash with the file input. Every line of the file
   * {@code fn + ".dat"} is expected to contain a key and a value, separated
   * by a tab. I/O errors are only reported as debug output; the hash then
   * contains the entries that were read so far.
   * @param hash hash to be filled
   * @param fn file to be read
   */
  private void index(final TokenMap hash, final String fn) {
    try {
      final DataAccess in = new DataAccess(new IOFile(fn + ".dat"));
      try {
        while(true) {
          final byte[] key = find(in, (byte) '\t');
          final byte[] val = find(in, (byte) '\n');
          // stop at the end of the input; incomplete trailing pairs are skipped
          if(key == null || val == null) break;
          hash.add(key, val);
        }
      } finally {
        // release the file handle
        in.close();
      }
    } catch(final IOException ex) {
      Util.debug(ex);
    }
  }

  /**
   * This is a simple data structure for storing MAB2 entries.
   *
   * @author BaseX Team 2005-12, BSD License
   * @author Christian Gruen
   */
  static final class MAB2Entry {
    /** Children offsets. */
    long[] children;
    /** File offset; 0 if no parent node exists. */
    long pos;
    /** Number of children. */
    int size;

    /**
     * Adds a child. The array of children is doubled in size if it is full.
     * @param c child to be added
     */
    void add(final long c) {
      if(children == null) children = new long[1];
      else if(size == children.length) children = Arrays.copyOf(children, size << 1);
      children[size++] = c;
    }

    /**
     * Sets the file offset.
     * @param p file offset
     */
    void pos(final long p) {
      pos = p;
    }
  }
}