diff --git a/core/src/main/java/org/luaj/vm2/libs/Utf8Lib.java b/core/src/main/java/org/luaj/vm2/libs/Utf8Lib.java new file mode 100644 index 00000000..ae1c9f63 --- /dev/null +++ b/core/src/main/java/org/luaj/vm2/libs/Utf8Lib.java @@ -0,0 +1,264 @@ +package org.luaj.vm2.libs; + +import org.luaj.vm2.LuaError; +import org.luaj.vm2.LuaString; +import org.luaj.vm2.LuaTable; +import org.luaj.vm2.LuaValue; +import org.luaj.vm2.Varargs; + +/** + * Lua 5.3 utf8 library. + */ +public class Utf8Lib extends TwoArgFunction { + + private static final LuaString CHARPATTERN = LuaValue.valueOf( + "[\u0000-\u007F\u00C2-\u00F4][\u0080-\u00BF]*"); + + public LuaValue call(LuaValue modname, LuaValue env) { + LuaTable utf8 = new LuaTable(); + utf8.set("char", new utf8_char()); + utf8.set("codes", new codes()); + utf8.set("codepoint", new codepoint()); + utf8.set("len", new len()); + utf8.set("offset", new offset()); + utf8.set("charpattern", CHARPATTERN); + env.set("utf8", utf8); + if (!env.get("package").isnil()) { + env.get("package").get("loaded").set("utf8", utf8); + } + return utf8; + } + + static final class utf8_char extends VarArgFunction { + public Varargs invoke(Varargs args) { + int n = args.narg(); + if (n == 0) { + return LuaValue.EMPTYSTRING; + } + StringBuilder builder = new StringBuilder(); + for (int i = 1; i <= n; i++) { + long codepoint = args.checklong(i); + appendCodePoint(builder, codepoint); + } + return LuaValue.valueOf(builder.toString()); + } + } + + static final class codepoint extends VarArgFunction { + public Varargs invoke(Varargs args) { + LuaString s = args.checkstring(1); + int len = s.rawlen(); + int start = relativeIndex(args.optint(2, 1), len); + int end = relativeIndex(args.optint(3, start), len); + if (start < 1 || start > len + 1) { + argerror(2, "out of range"); + } + if (end < start - 1 || end > len) { + argerror(3, "out of range"); + } + if (start > end) { + return NONE; + } + LuaValue[] values = new LuaValue[end - start + 1]; + int count = 0; + int pos = start - 1; + int limit = end; + while (pos < limit) { + Decoded decoded = decode(s, pos); + if (decoded.next > limit) { + throw new LuaError("invalid UTF-8 code"); + } + values[count++] = LuaValue.valueOf(decoded.codepoint); + pos = decoded.next; + } + if (pos != limit) { + throw new LuaError("invalid UTF-8 code"); + } + return LuaValue.varargsOf(values); + } + } + + static final class len extends VarArgFunction { + public Varargs invoke(Varargs args) { + LuaString s = args.checkstring(1); + int len = s.rawlen(); + int start = relativeIndex(args.optint(2, 1), len); + int end = relativeIndex(args.optint(3, -1), len); + if (start < 1 || start > len + 1) { + argerror(2, "out of range"); + } + if (end < start - 1 || end > len) { + argerror(3, "out of range"); + } + int pos = start - 1; + int count = 0; + int limit = end; + while (pos < limit) { + try { + Decoded decoded = decode(s, pos); + if (decoded.next > limit) { + return LuaValue.varargsOf(new LuaValue[] { NIL, LuaValue.valueOf(pos + 1) }); + } + pos = decoded.next; + count++; + } catch (LuaError e) { + return LuaValue.varargsOf(new LuaValue[] { NIL, LuaValue.valueOf(pos + 1) }); + } + } + if (pos != limit) { + return LuaValue.varargsOf(new LuaValue[] { NIL, LuaValue.valueOf(pos + 1) }); + } + return LuaValue.valueOf(count); + } + } + + static final class offset extends VarArgFunction { + public Varargs invoke(Varargs args) { + LuaString s = args.checkstring(1); + int n = args.checkint(2); + int len = s.rawlen(); + int i = args.narg() >= 3 ? relativeIndex(args.checkint(3), len) : (n >= 0 ? 1 : len + 1); + if (i < 1 || i > len + 1) { + argerror(3, "position out of range"); + } + if (n == 0) { + if (i == len + 1) { + if (len == 0) { + return NIL; + } + i = len; + } + while (i > 1 && isContinuation(s.luaByte(i - 1))) { + i--; + } + if (i <= len && isContinuation(s.luaByte(i - 1))) { + throw new LuaError("initial position is a continuation byte"); + } + return LuaValue.valueOf(i); + } + int pos = i; + if (n > 0) { + pos--; + while (n > 0) { + if (pos >= len) { + return NIL; + } + Decoded decoded = decode(s, pos); + pos = decoded.next; + n--; + } + return LuaValue.valueOf(pos + 1); + } + pos--; + while (n < 0) { + if (pos <= 0) { + return NIL; + } + pos--; + while (pos > 0 && isContinuation(s.luaByte(pos))) { + pos--; + } + if (isContinuation(s.luaByte(pos))) { + throw new LuaError("invalid UTF-8 code"); + } + n++; + } + return LuaValue.valueOf(pos + 1); + } + } + + static final class codes extends VarArgFunction { + public Varargs invoke(Varargs args) { + LuaValue arg = args.arg1(); + LuaString s = arg.checkstring(); + return LuaValue.varargsOf(new LuaValue[] { new codes_iter(s), s, LuaValue.ZERO }); + } + } + + static final class codes_iter extends VarArgFunction { + private final LuaString s; + + codes_iter(LuaString s) { + this.s = s; + } + + public Varargs invoke(Varargs args) { + int index = args.arg(2).optint(0); + if (index < 0 || index > s.rawlen()) { + return NONE; + } + if (index == s.rawlen()) { + return NONE; + } + Decoded decoded = decode(s, index); + return LuaValue.varargsOf(new LuaValue[] { + LuaValue.valueOf(index + 1), + LuaValue.valueOf(decoded.codepoint) + }); + } + } + + private static int relativeIndex(int index, int len) { + return index >= 0 ? index : len + index + 1; + } + + private static boolean isContinuation(int b) { + return (b & 0xC0) == 0x80; + } + + private static Decoded decode(LuaString s, int pos) { + int first = s.luaByte(pos); + if (first < 0x80) { + return new Decoded(first, pos + 1); + } + int needed; + int minCodePoint; + int codepoint; + if ((first & 0xE0) == 0xC0) { + needed = 1; + minCodePoint = 0x80; + codepoint = first & 0x1F; + } else if ((first & 0xF0) == 0xE0) { + needed = 2; + minCodePoint = 0x800; + codepoint = first & 0x0F; + } else if ((first & 0xF8) == 0xF0) { + needed = 3; + minCodePoint = 0x10000; + codepoint = first & 0x07; + } else { + throw new LuaError("invalid UTF-8 code"); + } + if (pos + needed >= s.rawlen()) { + throw new LuaError("invalid UTF-8 code"); + } + for (int i = 1; i <= needed; i++) { + int b = s.luaByte(pos + i); + if (!isContinuation(b)) { + throw new LuaError("invalid UTF-8 code"); + } + codepoint = (codepoint << 6) | (b & 0x3F); + } + if (codepoint < minCodePoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) { + throw new LuaError("invalid UTF-8 code"); + } + return new Decoded(codepoint, pos + needed + 1); + } + + private static void appendCodePoint(StringBuilder builder, long codepoint) { + if (codepoint < 0 || codepoint > 0x10FFFFL || (codepoint >= 0xD800L && codepoint <= 0xDFFFL)) { + throw new LuaError("value out of range"); + } + builder.appendCodePoint((int) codepoint); + } + + private static final class Decoded { + final int codepoint; + final int next; + + Decoded(int codepoint, int next) { + this.codepoint = codepoint; + this.next = next; + } + } +}