/*
 * Copyright (C) 2010, 2011.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 or
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
package uk.me.parabola.mkgmap.srt;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import uk.me.parabola.imgfmt.ExitException;
import uk.me.parabola.imgfmt.app.srt.SRTFile;
import uk.me.parabola.imgfmt.app.srt.Sort;
import uk.me.parabola.imgfmt.fs.ImgChannel;
import uk.me.parabola.imgfmt.sys.FileImgChannel;
import uk.me.parabola.mkgmap.scan.SyntaxException;
import uk.me.parabola.mkgmap.scan.TokType;
import uk.me.parabola.mkgmap.scan.Token;
import uk.me.parabola.mkgmap.scan.TokenScanner;

/**
 * Read in a sort file from a text format.
 *
 * The file is in utf-8, regardless of the target codepage.
 *
 * The file should start with a codepage declaration, which determines the
 * target codepage for the sort.  This can be followed by a description which is
 * added into the SRT file.
 *
 * The characters are listed in order arranged in a way that shows the strength of the
 * difference between the characters. These are:
 *
 * <ul>
 * <li>Primary difference - different letters (eg a and b)</li>
 * <li>Secondary difference - different accents (eg a and a-acute)</li>
 * <li>Tertiary difference - different case (eg a and A)</li>
 * </ul>
 *
 * The sort order section begins with the word 'code' (or its newer name 'characters').
 *
 * Primary differences are represented by the less-than separator.
 * Secondary differences are represented by the semi-colon separator.
 * Tertiary differences are represented by the comma separator.
 *
 * Characters are represented in <em>unicode (utf-8)</em> (the whole file must be in utf-8).
 * Or alternatively you can use a four hex-digit number.  A few special punctuation characters must
 * be written that way to prevent them being mistaken for separators.
 *
 * Example
 * <pre>
 * # This is a comment
 * codepage 1252
 * description "Example sort"
 * code a, A; â, Â
 * &lt; b, B
 * # Last two lines could be written:
 * # code a, A; â, Â &lt; b, B
 * </pre>
 *
 * @author Steve Ratcliffe
 */
public class SrtTextReader {

    // States
    private static final int IN_INITIAL = 0;
    private static final int IN_CHARACTER = 1;
    private static final int IN_EXPAND = 2;

    // Data that is read in, the output of the reading operation
    private final Sort sort = new Sort();

    // Encoder for the target codepage; null until a 'codepage' command has been read.
    private CharsetEncoder encoder;

    // Used during parsing.
    private int pos1;
    private int pos2;
    private int pos3;
    private int state;
    private String cflags = "";

    /**
     * Read a sort description from an already opened reader.
     *
     * @param r The source of the sort description text. It is not closed by this constructor.
     * @throws IOException Declared for callers; kept for interface compatibility.
     */
    public SrtTextReader(Reader r) throws IOException {
        this("stream", r);
    }

    /**
     * Read a sort description from the named utf-8 file.
     *
     * The file is closed again once it has been read, including when reading fails.
     *
     * @param filename The file to read.
     * @throws IOException If the file cannot be opened or closed.
     */
    private SrtTextReader(String filename) throws IOException {
        try (Reader r = new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8)) {
            read(filename, r);
        }
    }

    /**
     * Read a sort description from a reader.
     *
     * @param filename Name used for display purposes in error messages.
     * @param r The source of the sort description text.
     * @throws IOException Declared for callers; kept for interface compatibility.
     */
    private SrtTextReader(String filename, Reader r) throws IOException {
        read(filename, r);
    }

    /**
     * Find and read in the default sort description for the given codepage.
     *
     * If there is no description resource for the codepage, then the one for
     * code-page 1252 is read instead and relabelled with the requested codepage.
     *
     * @param codepage The codepage to find the sort description for.
     * @return The sort that was read.
     * @throws ExitException If no description is available even for code-page 1252, or
     * the description resource cannot be read.
     */
    public static Sort sortForCodepage(int codepage) {
        String name = "sort/cp" + codepage + ".txt";
        InputStream is = Sort.class.getClassLoader().getResourceAsStream(name);
        if (is == null) {
            if (codepage == 1252)
                throw new ExitException("No sort description for code-page 1252 available");

            Sort defaultSort = SrtTextReader.sortForCodepage(1252);
            defaultSort.setCodepage(codepage);
            defaultSort.setDescription("Default sort");
            return defaultSort;
        }

        // Closing the reader also closes the underlying resource stream.
        try (Reader r = new InputStreamReader(is, StandardCharsets.UTF_8)) {
            SrtTextReader sr = new SrtTextReader(r);
            return sr.getSort();
        } catch (IOException e) {
            // Retrying the same codepage here would recurse without bound when the
            // failure is persistent, so report the problem instead.
            throw new ExitException("Could not read sort description " + name + ": " + e);
        }
    }

    /**
     * Read in a file and save the information in a form that can be used
     * to compare strings.
     * @param filename The name of the file, used for display purposes. It need
     * not refer to a file that actually exists.
     * @param r The opened file or other readable source.
     * @throws SyntaxException If the format of the file is incorrect.
     */
    public void read(String filename, Reader r) {
        TokenScanner scanner = new TokenScanner(filename, r);
        resetPos();
        state = IN_INITIAL;
        while (!scanner.isEndOfFile()) {
            Token tok = scanner.nextToken();

            // We deal with whole line comments here
            if (tok.isValue("#")) {
                scanner.skipLine();
                continue;
            }

            switch (state) {
            case IN_INITIAL:
                initialState(scanner, tok);
                break;
            case IN_CHARACTER:
                characterState(scanner, tok);
                break;
            case IN_EXPAND:
                expandState(scanner, tok);
                break;
            }
        }

        sort.finish();
    }

    /**
     * The initial state, looking for a variable to set or a command to change
     * the state.
     * @param scanner The scanner for more tokens.
     * @param tok The first token to process.
     */
    private void initialState(TokenScanner scanner, Token tok) {
        String val = tok.getValue();
        TokType type = tok.getType();
        if (type == TokType.TEXT) {
            switch (val) {
            case "codepage":
                int codepage = scanner.nextInt();
                sort.setCodepage(codepage);
                encoder = sort.getCharset().newEncoder();
                break;
            case "description":
                sort.setDescription(scanner.nextWord());
                break;
            case "id1":
                sort.setId1(scanner.nextInt());
                break;
            case "id2":
                sort.setId2(scanner.nextInt());
                break;
            case "multi":
                sort.setMulti(true);
                break;
            case "code": // The old name; use characters
            case "characters":
                // Characters cannot be converted to the codepage without an encoder.
                if (encoder == null)
                    throw new SyntaxException(scanner, "Missing codepage declaration before code");
                state = IN_CHARACTER;
                scanner.skipSpace();
                break;
            case "expand":
                state = IN_EXPAND;
                scanner.skipSpace();
                break;
            default:
                throw new SyntaxException(scanner, "Unrecognised command " + val);
            }
        }
    }

    /**
     * Block consisting of characters and relations between them.
     *
     * The sort order is derived from this.
     *
     * @param scanner The scanner for more tokens.
     * @param tok The current token to process.
     */
    private void characterState(TokenScanner scanner, Token tok) {
        String val = tok.getValue();
        TokType type = tok.getType();
        if (type == TokType.TEXT) {
            switch (val) {
            case "flags":
                scanner.validateNext("=");
                cflags = scanner.nextWord();
                // TODO not yet
                break;
            case "pos": {
                // Used to set the actual sort position value, not used any more
                scanner.validateNext("=");
                int newPos = readPosition(scanner);
                if (newPos < pos1)
                    throw new SyntaxException(scanner, "cannot set primary position backwards, was " + pos1);
                pos1 = newPos;
                break;
            }
            case "pos2":
                // Used to set the actual sort position value, not used any more
                scanner.validateNext("=");
                pos2 = readPosition(scanner);
                break;
            case "pos3":
                // Used to set the actual sort position value, not used any more
                scanner.validateNext("=");
                pos3 = readPosition(scanner);
                break;
            case "code": // the old name, use 'characters'
            case "characters":
                advancePos();
                break;
            case "expand":
                state = IN_EXPAND;
                // Same transition as in the initial state, which also skips the
                // space that follows the keyword.
                scanner.skipSpace();
                break;
            default:
                addCharacter(scanner, val);
                break;
            }
        } else if (type == TokType.SYMBOL) {
            switch (val) {
            case "=":
                break;
            case ",":
                // Tertiary difference
                pos3++;
                break;
            case ";":
                // Secondary difference, the tertiary position starts again
                pos3 = 1;
                pos2++;
                break;
            case "<":
                // Primary difference
                advancePos();
                break;
            default:
                addCharacter(scanner, val);
                break;
            }
        }
    }

    /**
     * Read the next word as an integer position value.
     *
     * The number is parsed with {@link Integer#decode(String)}, so decimal, hex and
     * octal forms are accepted.
     *
     * @param scanner The scanner to read the word from.
     * @return The integer value of the word.
     * @throws SyntaxException If the word is not a valid integer.
     */
    private static int readPosition(TokenScanner scanner) {
        String word = scanner.nextWord();
        try {
            return Integer.decode(word);
        } catch (NumberFormatException e) {
            throw new SyntaxException(scanner, "invalid integer for position");
        }
    }

    /**
     * Within an 'expand' command. The whole command is read before return, they can not span
     * lines.
     *
     * @param scanner The scanner for more tokens.
     * @param tok The first token after the keyword.
     */
    private void expandState(TokenScanner scanner, Token tok) {
        String val = tok.getValue();

        Code code = new Code(scanner, val).read();

        String s = scanner.nextValue();
        if (!s.equals("to"))
            throw new SyntaxException(scanner, "Expected the word 'to' in expand command");

        // Everything up to the end of the line is part of the expansion.
        List<Integer> expansionList = new ArrayList<>();
        while (!scanner.isEndOfFile()) {
            Token t = scanner.nextRawToken();
            if (t.isEol())
                break;
            if (t.isWhiteSpace())
                continue;

            Code r = new Code(scanner, t.getValue()).read();
            expansionList.add(r.getBval());
        }

        sort.addExpansion(code.getBval(), charFlags(code.getCval()), expansionList);
        state = IN_INITIAL;
    }

    /**
     * Add a character to the sort table.
     * @param scanner Input scanner, for line number information.
     * @param val A single character string containing the character to be added. This will
     * be either a single character which is the unicode representation of the character, or
     * two or more characters which is the hex representation of the character.
     */
    private void addCharacter(TokenScanner scanner, String val) {
        Code code = new Code(scanner, val).read();
        setSortcode(code.getBval(), code.getCval());
    }

    /**
     * Set the sort code for the given character.
     *
     * Any flags given with a 'flags' command apply to one character only and are
     * cleared here.
     *
     * @param ch The character as encoded in the target codepage.
     * @param cval The same character in unicode, used to work out the character flags.
     */
    private void setSortcode(int ch, int cval) {
        // The flags must come from the unicode value: the codepage value of a
        // character is in general a different number.
        int flags = charFlags(cval);
        if (cflags.contains("0"))
            flags = 0;

        sort.add(ch, pos1, pos2, pos3, flags);
        this.cflags = "";
    }

    /**
     * The flags that describe the kind of character. Known ones
     * are letter and digit. There may be others.
     * @param ch The actual character (unicode).
     * @return The flags that apply to it: 1 for a letter, 2 for a digit, otherwise 0.
     */
    private int charFlags(int ch) {
        int flags = 0;
        // NOTE(review): Character.getType returns a category code rather than a bit mask,
        // so this test rejects OTHER_LETTER (5) as well as MODIFIER_LETTER (4). Left as it
        // is because changing it alters the generated sort data - confirm the intent.
        if (Character.isLetter(ch) && (Character.getType(ch) & Character.MODIFIER_LETTER) == 0)
            flags = 1;
        if (Character.isDigit(ch))
            flags = 2;
        return flags;
    }

    /**
     * Reset the position fields to their initial values.
     */
    private void resetPos() {
        pos1 = 0;
        pos2 = 0;
        pos3 = 0;
    }

    /**
     * Advance the major position value, resetting the minor position variables.
     */
    private void advancePos() {
        if (pos1 == 0)
            pos1 = 1;
        else
            pos1 += pos2;
        pos2 = 1;
        pos3 = 1;
    }

    /**
     * @return The sort that was built from the text that was read.
     */
    public Sort getSort() {
        return sort;
    }

    /**
     * Read in a sort description text file and create a SRT from it.
     * @param args First arg is the text input file, the second is the name of the output file. The defaults are
     * in.txt and out.srt.
     * @throws IOException If the input cannot be read or the output cannot be written.
     */
    public static void main(String[] args) throws IOException {
        String infile = "in.txt";
        if (args.length > 0)
            infile = args[0];

        String outfile = "out.srt";
        if (args.length > 1)
            outfile = args[1];

        // Read the input before the output is opened, so that a bad input file
        // does not leave an output channel open.
        SrtTextReader tr = new SrtTextReader(infile);
        Sort sort1 = tr.getSort();

        ImgChannel chan = new FileImgChannel(outfile, "rw");
        try {
            SRTFile sf = new SRTFile(chan);
            sf.setSort(sort1);
            sf.write();
            sf.close();
        } finally {
            chan.close();
        }
    }

    /**
     * Helper to represent a code read from the file.
     *
     * You can write it in unicode, or as a hex number.
     * We work out what you wrote, and return both the code point in
     * the codepage and the unicode character form of the letter.
     */
    private class Code {
        private final TokenScanner scanner;
        private final String val;
        private int cval;
        private int bval;

        /**
         * @param scanner Input scanner, used for position information in error messages.
         * @param val The text of the code as written in the file.
         */
        public Code(TokenScanner scanner, String val) {
            this.scanner = scanner;
            this.val = val;
        }

        /**
         * Get the character encoded in the code-page encoding.
         *
         * It will be one byte for the format-9 code pages cp1252 etc.
         * @return A character encoded in the code-page.
         */
        public int getBval() {
            return bval;
        }

        /**
         * Get the character in unicode.
         *
         * It will in general be a 2 byte value.
         *
         * @return The character expressed in unicode.
         */
        public int getCval() {
            return cval;
        }

        /**
         * Work out the unicode and codepage values of the code.
         *
         * A value that is one character long is taken as the character itself, anything
         * else is taken to be a hex number.  For a multi sort the codepage value is the
         * same as the unicode value, otherwise it is the character encoded in the codepage.
         *
         * @return This object, so that calls can be chained.
         * @throws SyntaxException If the value is not a valid hex number or cannot be
         * represented as a single byte in the codepage.
         */
        public Code read() {
            try {
                if (val.length() == 1) {
                    cval = val.charAt(0);
                } else {
                    cval = Integer.parseInt(val, 16);
                }

                if (sort.isMulti()) {
                    bval = cval;
                } else {
                    CharBuffer cbuf = CharBuffer.wrap(new char[] {(char) cval});
                    ByteBuffer out = encoder.encode(cbuf);
                    if (out.remaining() > 1)
                        throw new SyntaxException(scanner, "more than one character resulted from conversion of " + val);
                    bval = out.get() & 0xff;
                }
            } catch (NumberFormatException e) {
                throw new SyntaxException(scanner, "Not a valid hex number " + val);
            } catch (CharacterCodingException e) {
                throw new SyntaxException(scanner, "Character not valid in character set '" + val + "'");
            }
            return this;
        }

        /**
         * @return The unicode value of the code as a hex string.
         */
        @Override
        public String toString() {
            return String.format("%x", cval);
        }
    }
}