From 534e69fcfbbb0f0d76890a92a082a44bd3c6eb9c Mon Sep 17 00:00:00 2001
From: Ian Farmer
Date: Tue, 18 Sep 2007 04:18:39 +0000
Subject: [PATCH] Add pattern matcher implementation ported from C version of
 Lua. Still work to do: (1) Replace use of recursion with while/continue (2)
 Remove as many array bounds checks as possible

---
 .../java/lua/addon/luacompat/StrLib.java      | 372 +++++++++++++++++-
 src/main/java/lua/value/LString.java          |  12 +
 src/test/java/lua/LuaJTest.java               |   4 +
 3 files changed, 382 insertions(+), 6 deletions(-)

diff --git a/src/addon/java/lua/addon/luacompat/StrLib.java b/src/addon/java/lua/addon/luacompat/StrLib.java
index 3ce68a72..3cb693ed 100644
--- a/src/addon/java/lua/addon/luacompat/StrLib.java
+++ b/src/addon/java/lua/addon/luacompat/StrLib.java
@@ -10,7 +10,7 @@ public class StrLib {
 	 * string.byte (s [, i [, j]])
 	 * 
 	 * Returns the internal numerical codes of the
-	 * characters s[i], s[i+1], ···, s[j]. The default value for i is 1; the
+	 * characters s[i], s[i+1], ..., s[j]. The default value for i is 1; the
 	 * default value for j is i.
 	 * 
 	 * Note that numerical codes are not necessarily portable across platforms.
@@ -30,7 +30,7 @@ public class StrLib {
 	}
 
 	/**
-	 * string.char (···)
+	 * string.char (...)
 	 * 
 	 * Receives zero or more integers. Returns a string with length equal
 	 * to the number of arguments, in which each character has the internal
@@ -83,7 +83,7 @@ public class StrLib {
 	}
 
 	/**
-	 * string.format (formatstring, ···)
+	 * string.format (formatstring, ...)
 	 * 
 	 * Returns a formatted version of its variable number of arguments following
 	 * the description given in its first argument (which must be a string).
@@ -207,9 +207,45 @@ public class StrLib {
 	/**
 	 * string.match (s, pattern [, init])
 	 * 
-	 * Looks for the first match of pattern in the string s. If it finds one, then match returns the captures from the pattern; otherwise it returns nil. If pattern specifies no captures, then the whole match is returned. A third, optional numerical argument init specifies where to start the search; its default value is 1 and may be negative.
+	 * Looks for the first match of pattern in the string s. If it finds one,
+	 * then match returns the captures from the pattern; otherwise it returns
+	 * nil. If pattern specifies no captures, then the whole match is returned.
+	 * A third, optional numerical argument init specifies where to start the
+	 * search; its default value is 1 and may be negative.
 	 */
-	static void match( VM vm ) {
+	static void match( VM vm ) {
+		LString s = vm.getArgAsLuaString( 0 );
+		LString pat = vm.getArgAsLuaString( 1 );
+		int init = vm.getArgCount() > 2 ? vm.getArgAsInt( 2 ) : 1;
+
+		if ( init > 0 ) {
+			init = Math.min( init - 1, s.length() );
+		} else if ( init < 0 ) {
+			init = Math.max( 0, s.length() + init );
+		}
+
+		MatchState ms = new MatchState( vm, s, pat );
+
+		// TODO: check if pattern contains special characters,
+		// if not do a simpler search.
+		boolean anchor = false;
+		int poff = 0;
+		if ( pat.length() > 0 && pat.charAt( 0 ) == '^' ) { // empty pattern has no char 0; it matches the empty string
+			anchor = true;
+			poff = 1;
+		}
+
+		int soff = init;
+		do {
+			int res;
+			ms.reset();
+			if ( ( res = ms.match( soff, poff ) ) != -1 ) {
+				ms.push_captures( true, soff, res );
+				return;
+			}
+		} while ( soff++ < s.length() && !anchor );
+
+		vm.setResult( LNil.NIL );
 	}
 
 	/**
@@ -294,5 +330,329 @@ public class StrLib {
 
 	}
 
-	
+	// Pattern matching implementation
+
+	private static final int L_ESC = '%';
+	private static final int MAX_CAPTURES = 32;
+
+	private static final int CAP_UNFINISHED = -1;
+	private static final int CAP_POSITION = -2;
+
+	private static class MatchState {
+		final LString s;
+		final LString p;
+		final VM vm;
+		int level;
+		int[] cinit;
+		int[] clen;
+
+		MatchState( VM vm, LString s, LString pattern ) {
+			this.s = s;
+			this.p = pattern;
+			this.vm = vm;
+			this.level = 0;
+			this.cinit = new int[ MAX_CAPTURES ];
+			this.clen = new int[ MAX_CAPTURES ];
+		}
+
+		void reset() {
+			level = 0;
+		}
+
+		void push_captures( boolean wholeMatch, int soff, int end ) {
+			vm.setResult();
+			int nlevels = ( this.level == 0 && wholeMatch ) ? 1 : this.level;
+			for ( int i = 0; i < nlevels; ++i ) {
+				push_onecapture( i, soff, end );
+			}
+		}
+
+		private void push_onecapture( int i, int soff, int end ) {
+			if ( i >= this.level ) {
+				if ( i == 0 ) {
+					vm.push( s.substring( soff, end ) );
+				}
+			} else {
+				int l = clen[i];
+				if ( l == CAP_UNFINISHED ) {
+					vm.luaL_error( "unfinished capture" );
+				}
+				if ( l == CAP_POSITION ) {
+					vm.push( new LInteger( cinit[i] + 1 ) );
+				} else {
+					int begin = cinit[i];
+					vm.push( s.substring( begin, begin + l ) );
+				}
+			}
+		}
+
+		private int check_capture( int l ) {
+			l -= '1';
+			if ( l < 0 || l >= level || this.clen[l] == CAP_UNFINISHED ) {
+				vm.luaL_error("invalid capture index");
+			}
+			return l;
+		}
+
+		private int capture_to_close() {
+			int level = this.level;
+			for ( level--; level >= 0; level-- )
+				if ( clen[level] == CAP_UNFINISHED )
+					return level;
+			vm.push("invalid pattern capture");
+			vm.lua_error();
+			return 0;
+		}
+
+		int classend( int poffset ) {
+			switch ( p.charAt( poffset++ ) ) {
+			case L_ESC:
+				if ( poffset == p.length() ) {
+					vm.push( "malformed pattern (ends with %)" );
+					vm.lua_error();
+				}
+				return poffset + 1;
+
+			case '[':
+				if ( poffset < p.length() && p.charAt( poffset ) == '^' ) poffset++; // '[' may be the last pattern char
+				do {
+					if ( poffset == p.length() ) {
+						vm.push( "malformed pattern (missing ])" );
+						vm.lua_error();
+					}
+					if ( p.charAt( poffset++ ) == L_ESC && poffset != p.length() )
+						poffset++;
+				} while ( poffset == p.length() || p.charAt( poffset ) != ']' ); // at end: loop again so the check above reports it
+				return poffset + 1;
+			default:
+				return poffset;
+			}
+		}
+
+		static boolean isalpha( int c ) {
+			return ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );
+		}
+
+		static boolean match_class( int c, int cl ) {
+			boolean res;
+			switch ( Character.toLowerCase( cl ) ) { // dispatch on the class letter, not the subject character
+			case 'a': res = isalpha( c ); break;
+			case 'd': res = Character.isDigit( (char) c ); break;
+			case 'l': res = Character.isLowerCase( (char) c ); break;
+			case 'u': res = Character.isUpperCase( (char) c ); break;
+			case 'z': res = ( c == 0 ); break;
+			case 'c':
+			case 'p':
+			case 's':
+			case 'w':
+			case 'x':
+				throw new RuntimeException("match: unimplemented: %" + (char)cl );
+			default: return cl == c;
+			}
+			return ( Character.isLowerCase( (char) cl ) ? res : !res );
+		}
+
+		boolean matchbracketclass( int c, int poff, int ec ) {
+			boolean sig = true;
+			if ( p.charAt( poff + 1 ) == '^' ) {
+				sig = false;
+				poff++;
+			}
+			while ( ++poff < ec ) {
+				if ( p.charAt( poff ) == L_ESC ) {
+					poff++;
+					if ( match_class( c, p.charAt( poff ) ) )
+						return sig;
+				}
+				else if ( ( p.charAt( poff + 1 ) == '-' ) && ( poff + 2 < ec ) ) {
+					poff += 2;
+					if ( p.charAt( poff - 2 ) <= c && c <= p.charAt( poff ) )
+						return sig;
+				}
+				else if ( p.charAt( poff ) == c ) return sig;
+			}
+			return !sig;
+		}
+
+		boolean singlematch( int c, int poff, int ep ) {
+			switch ( p.charAt( poff ) ) {
+			case '.': return true;
+			case L_ESC: return match_class( c, p.charAt( poff + 1 ) );
+			case '[': return matchbracketclass( c, poff, ep - 1 );
+			default: return p.charAt( poff ) == c;
+			}
+		}
+
+		/**
+		 * Perform pattern matching. If there is a match, returns offset into s
+		 * where match ends, otherwise returns -1.
+		 */
+		int match( int soffset, int poffset ) {
+			// Java doesn't have goto -- we could wrap this in a while (true)
+			// and use labeled breaks instead, potentially. For now just use
+			// recursion - of course, this won't work very well, because java doesn't
+			// do tail recursion optimization - see:
+			// http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4726340
+			//
+//		init:
+			// Check if we are at the end of the pattern -
+			// equivalent to the '\0' case in the C version, but our pattern
+			// string is not NUL-terminated.
+			if ( poffset == p.length() )
+				return soffset;
+			switch ( p.charAt( poffset ) ) {
+			case '(':
+				if ( poffset + 1 < p.length() && p.charAt( poffset + 1 ) == ')' ) // '(' may be the last pattern char
+					return start_capture( soffset, poffset + 2, CAP_POSITION );
+				else
+					return start_capture( soffset, poffset + 1, CAP_UNFINISHED );
+			case ')':
+				return end_capture( soffset, poffset + 1 );
+			case L_ESC:
+				switch ( poffset + 1 < p.length() ? p.charAt( poffset + 1 ) : '\0' ) { // trailing '%' falls to default; classend reports it
+				case 'b':
+					soffset = matchbalance( soffset, poffset + 2 );
+					if ( soffset == -1 ) return -1;
+					// poffset += 4; goto init;
+					return match( soffset, poffset + 4 );
+				case 'f': {
+					poffset += 2;
+					if ( poffset == p.length() || p.charAt( poffset ) != '[' ) { // "%f" at pattern end is also missing its '['
+						vm.push("Missing [ after %f in pattern");
+						vm.lua_error();
+					}
+					int ep = classend( poffset );
+					int previous = ( soffset == 0 ) ? '\0' : s.charAt( soffset - 1 ); // C version uses '\0' before the subject start
+					if ( matchbracketclass( previous, poffset, ep - 1 ) ||
+						 !matchbracketclass( soffset < s.length() ? s.charAt( soffset ) : '\0', poffset, ep - 1 ) ) // frontier: prev NOT in set, current IN set
+						return -1;
+					// poffset = ep; goto init
+					return match( soffset, ep );
+				}
+				default: {
+					int c = ( poffset + 1 < p.length() ) ? p.charAt( poffset + 1 ) : '\0'; // no char after a trailing '%'
+					if ( Character.isDigit( (char) c ) ) {
+						soffset = match_capture( soffset, c );
+						if ( soffset == -1 )
+							return -1;
+						// poffset += 2; goto init;
+						return match( soffset, poffset + 2 );
+					}
+					return match2( soffset, poffset ); // XXX: better name
+				}
+				}
+			case '$':
+				if ( poffset + 1 == p.length() )
+					return ( soffset == s.length() ) ? soffset : -1;
+				return match2( soffset, poffset );
+			}
+			return match2( soffset, poffset );
+		}
+
+		int match2( int soffset, int poffset ) {
+			int ep = classend( poffset );
+			boolean m = soffset < s.length() && singlematch( s.charAt( soffset ), poffset, ep );
+			int pc = ( ep < p.length() ) ? p.charAt( ep ) : '\0';
+
+			switch ( pc ) {
+			case '?':
+				int res;
+				if ( m && ( ( res = match( soffset + 1, ep + 1 ) ) != -1 ) )
+					return res;
+				// p = ep + 1; goto init;
+				return match( soffset, ep + 1 );
+			case '*':
+				return max_expand( soffset, poffset, ep );
+			case '+':
+				return ( m ? max_expand( soffset + 1, poffset, ep ) : -1 );
+			case '-':
+				return min_expand( soffset, poffset, ep );
+			default:
+				if ( !m ) return -1;
+				// s++; p = ep; goto init;
+				return match( soffset+1, ep );
+			}
+		}
+
+		int max_expand( int soff, int poff, int ep ) {
+			int i = 0;
+			while ( soff + i < s.length() &&
+					singlematch( s.charAt( soff + i ), poff, ep ) )
+				i++;
+			while ( i >= 0 ) {
+				int res = match( soff + i, ep + 1 );
+				if ( res != -1 )
+					return res;
+				i--;
+			}
+			return -1;
+		}
+
+		int min_expand( int soff, int poff, int ep ) {
+			for ( ;; ) {
+				int res = match( soff, ep + 1 );
+				if ( res != -1 )
+					return res;
+				else if ( soff < s.length() && singlematch( s.charAt( soff ), poff, ep ) )
+					soff++;
+				else return -1;
+			}
+		}
+
+		int start_capture( int soff, int poff, int what ) {
+			int res;
+			int level = this.level;
+			if ( level >= MAX_CAPTURES ) {
+				vm.push( "too many captures" );
+				vm.lua_error();
+			}
+			cinit[ level ] = soff;
+			clen[ level ] = what;
+			this.level = level + 1;
+			if ( ( res = match( soff, poff ) ) == -1 )
+				this.level--;
+			return res;
+		}
+
+		int end_capture( int soff, int poff ) {
+			int l = capture_to_close();
+			int res;
+			clen[l] = soff - cinit[l];
+			if ( ( res = match( soff, poff ) ) == -1 )
+				clen[l] = CAP_UNFINISHED;
+			return res;
+		}
+
+		int match_capture( int soff, int l ) {
+			l = check_capture( l );
+			int len = clen[ l ];
+			if ( ( s.length() - soff ) >= len &&
+				 LString.equals( s, cinit[l], s, soff, len ) )
+				return soff + len;
+			else
+				return -1;
+		}
+
+		int matchbalance( int soff, int poff ) {
+			final int plen = p.length();
+			if ( poff == plen || poff + 1 == plen ) {
+				vm.push( "unbalanced pattern" );
+				vm.lua_error();
+			}
+			if ( soff >= s.length() || s.charAt( soff ) != p.charAt( poff ) ) // nothing left in the subject cannot open a balanced run
+				return -1;
+			else {
+				int b = p.charAt( poff );
+				int e = p.charAt( poff + 1 );
+				int cont = 1;
+				while ( ++soff < s.length() ) {
+					if ( s.charAt( soff ) == e ) {
+						if ( --cont == 0 ) return soff + 1;
+					}
+					else if ( s.charAt( soff ) == b ) cont++;
+				}
+			}
+			return -1;
+		}
+	}
 }
diff --git a/src/main/java/lua/value/LString.java b/src/main/java/lua/value/LString.java
index 56f902fb..67945b9c 100644
--- a/src/main/java/lua/value/LString.java
+++ b/src/main/java/lua/value/LString.java
@@ -118,6 +118,12 @@ public class LString extends LValue {
 		return new LString( m_bytes, m_offset + beginIndex, endIndex - beginIndex );
 	}
 
+	public int charAt( int index ) {
+		if ( index < 0 || index >= m_length )
+			throw new IndexOutOfBoundsException();
+		return m_bytes[ m_offset + index ] & 0x0FF; // substring() shares m_bytes, so index is relative to m_offset
+	}
+
 	public static LString valueOf( double d ) {
 		return new LString( String.valueOf( d ) );
 	}
@@ -275,7 +281,13 @@ public class LString extends LValue {
 		return s_stringMT;
 	}
 
+	public static boolean equals( LString a, int i, LString b, int j, int n ) {
+		return equals( a.m_bytes, a.m_offset + i, b.m_bytes, b.m_offset + j, n );
+	}
+
 	public static boolean equals( byte[] a, int i, byte[] b, int j, int n ) {
+		if ( a.length < i + n || b.length < j + n )
+			return false;
 		final int imax = i + n;
 		final int jmax = j + n;
 		while ( i < imax && j < jmax ) {
diff --git a/src/test/java/lua/LuaJTest.java b/src/test/java/lua/LuaJTest.java
index 319b3fea..d9b73da8 100644
--- a/src/test/java/lua/LuaJTest.java
+++ b/src/test/java/lua/LuaJTest.java
@@ -76,6 +76,10 @@ public class LuaJTest extends TestCase {
 		runTest( "setlist" );
 	}
 
+	public void testStrLib() throws IOException, InterruptedException {
+		runTest( "strlib" );
+	}
+
 	public void testType() throws IOException, InterruptedException {
 		runTest( "type" );
 	}