From bd6dd58a2de9e803d7d08051ae3d8369ac1827f9 Mon Sep 17 00:00:00 2001 From: Ian Farmer Date: Tue, 18 Sep 2007 05:17:09 +0000 Subject: [PATCH] String related improvements: (1) Fix string.char and string.byte (argument indexing was off by one) (2) Implement string.find (3) Change calls to charAt, which does array bounds checking, in the pattern matcher with calls to luaByte, which does not. (4) Replace use of recursion with while/continue in pattern matcher. --- .../java/lua/addon/luacompat/StrLib.java | 296 +++++++++--------- src/main/java/lua/value/LString.java | 29 +- src/test/res/strlib.lua | 15 + src/test/res/strlib.luac | Bin 0 -> 982 bytes 4 files changed, 196 insertions(+), 144 deletions(-) create mode 100644 src/test/res/strlib.lua create mode 100644 src/test/res/strlib.luac diff --git a/src/addon/java/lua/addon/luacompat/StrLib.java b/src/addon/java/lua/addon/luacompat/StrLib.java index 3cb693ed..a7a3d93a 100644 --- a/src/addon/java/lua/addon/luacompat/StrLib.java +++ b/src/addon/java/lua/addon/luacompat/StrLib.java @@ -18,9 +18,9 @@ public class StrLib { * @param vm the calling vm */ static void byte_( VM vm ) { - LString ls = vm.getArgAsLuaString(1); - int i = vm.getArgAsInt(2); - int j = vm.getArgAsInt(3); + LString ls = vm.getArgAsLuaString(0); + int i = vm.getArgAsInt(1); + int j = vm.getArgAsInt(2); int n = ls.length(); i = Math.max(1, i); j = Math.min(n, (j==0? i: j)); @@ -43,8 +43,8 @@ public class StrLib { public static void char_( VM vm) { int nargs = vm.getArgCount(); byte[] bytes = new byte[nargs]; - for ( int i=1; i<=nargs; i++ ) - vm.getArgAsInt(i); + for ( int i=0; i 2 ? vm.getArgAsInt( 2 ) : 1; - - if ( init > 0 ) { - init = Math.min( init - 1, s.length() ); - } else if ( init < 0 ) { - init = Math.max( 0, s.length() + init ); - } - - MatchState ms = new MatchState( vm, s, pat ); - - // TODO: check if pattern contains special characters, - // if not do a simpler search. - boolean anchor = false; - int poff = 0; - if ( pat.charAt( 0 ) == '^' ) { - anchor = true; - poff = 1; - } - - int soff = init; - do { - int res; - ms.reset(); - if ( ( res = ms.match( soff, poff ) ) != -1 ) { - ms.push_captures( true, soff, res ); - return; - } - } while ( soff++ < s.length() && !anchor ); - - vm.setResult( LNil.NIL ); + str_find_aux( vm, false ); } - + /** * string.rep (s, n) * @@ -328,11 +297,65 @@ public class StrLib { static void upper( VM vm ) { vm.setResult( new LString( vm.getArgAsString(1).toUpperCase() ) ); } - - + + /** + * This utility method implements both string.find and string.match. + */ + static void str_find_aux( VM vm, boolean find ) { + LString s = vm.getArgAsLuaString( 0 ); + LString pat = vm.getArgAsLuaString( 1 ); + int init = vm.getArgCount() > 2 ? vm.getArgAsInt( 2 ) : 1; + + if ( init > 0 ) { + init = Math.min( init - 1, s.length() ); + } else if ( init < 0 ) { + init = Math.max( 0, s.length() + init ); + } + + boolean fastMatch = find && ( vm.getArgAsBoolean( 3 ) || pat.indexOfAny( SPECIALS ) == -1 ); + vm.setResult(); + + if ( fastMatch ) { + int result = s.indexOf( pat, init ); + if ( result != -1 ) { + vm.push( result + 1 ); + vm.push( result + pat.length() ); + return; + } + } else { + MatchState ms = new MatchState( vm, s, pat ); + + boolean anchor = false; + int poff = 0; + if ( pat.luaByte( 0 ) == '^' ) { + anchor = true; + poff = 1; + } + + int soff = init; + do { + int res; + ms.reset(); + if ( ( res = ms.match( soff, poff ) ) != -1 ) { + if ( find ) { + vm.push( soff + 1 ); + vm.push( res ); + ms.push_captures( false, soff, res ); + } else { + ms.push_captures( true, soff, res ); + } + return; + } + } while ( soff++ < s.length() && !anchor ); + } + + vm.setResult( LNil.NIL ); + } + // Pattern matching implementation private static final int L_ESC = '%'; + private static final LString SPECIALS = new LString("^$*+?.([%-"); private static final int MAX_CAPTURES = 32; private static final int CAP_UNFINISHED = -1; @@ -360,7 +383,6 @@ public class StrLib { } void push_captures( boolean wholeMatch, int soff, int end ) { - vm.setResult(); int nlevels = ( this.level == 0 && wholeMatch ) ? 1 : this.level; for ( int i = 0; i < nlevels; ++i ) { push_onecapture( i, soff, end ); @@ -405,7 +427,7 @@ public class StrLib { } int classend( int poffset ) { - switch ( p.charAt( poffset++ ) ) { + switch ( p.luaByte( poffset++ ) ) { case L_ESC: if ( poffset == p.length() ) { vm.push( "malformed pattern (ends with %)" ); @@ -414,15 +436,15 @@ public class StrLib { return poffset + 1; case '[': - if ( p.charAt( poffset ) == '^' ) poffset++; + if ( p.luaByte( poffset ) == '^' ) poffset++; do { if ( poffset == p.length() ) { vm.push( "malformed pattern (missing ])" ); vm.lua_error(); } - if ( p.charAt( poffset++ ) == L_ESC && poffset != p.length() ) + if ( p.luaByte( poffset++ ) == L_ESC && poffset != p.length() ) poffset++; - } while ( p.charAt( poffset ) != ']' ); + } while ( p.luaByte( poffset ) != ']' ); return poffset + 1; default: return poffset; @@ -454,32 +476,32 @@ public class StrLib { boolean matchbracketclass( int c, int poff, int ec ) { boolean sig = true; - if ( p.charAt( poff + 1 ) == '^' ) { + if ( p.luaByte( poff + 1 ) == '^' ) { sig = false; poff++; } while ( ++poff < ec ) { - if ( p.charAt( poff ) == L_ESC ) { + if ( p.luaByte( poff ) == L_ESC ) { poff++; - if ( match_class( c, p.charAt( poff ) ) ) + if ( match_class( c, p.luaByte( poff ) ) ) return sig; } - else if ( ( p.charAt( poff + 1 ) == '-' ) && ( poff + 2 < ec ) ) { + else if ( ( p.luaByte( poff + 1 ) == '-' ) && ( poff + 2 < ec ) ) { poff += 2; - if ( p.charAt( poff - 2 ) <= c && c <= p.charAt( poff ) ) + if ( p.luaByte( poff - 2 ) <= c && c <= p.luaByte( poff ) ) return sig; } - else if ( p.charAt( poff ) == c ) return sig; + else if ( p.luaByte( poff ) == c ) return sig; } return !sig; } boolean singlematch( int c, int poff, int ep ) { - switch ( p.charAt( poff ) ) { + switch ( p.luaByte( poff ) ) { case '.': return true; - case L_ESC: return match_class( c, p.charAt( poff + 1 ) ); + case L_ESC: return match_class( c, p.luaByte( poff + 1 ) ); case '[': return matchbracketclass( c, poff, ep - 1 ); - default: return p.charAt( poff ) == c; + default: return p.luaByte( poff ) == c; } } @@ -488,96 +510,86 @@ public class StrLib { * where match ends, otherwise returns -1. */ int match( int soffset, int poffset ) { - // Java doesn't have goto -- we could wrap this in a while (true) - // and use labeled breaks instead, potentially. For now just use - // recursion - of course, this won't work very well, because java doesn't - // do tail recursion optimization - see: - // http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4726340 - // -// init: - // Check if we are at the end of the pattern - - // equivalent to the '\0' case in the C version, but our pattern - // string is not NUL-terminated. - if ( poffset == p.length() ) - return soffset; - switch ( p.charAt( poffset ) ) { - case '(': - if ( p.charAt( poffset + 1 ) == ')' ) - return start_capture( soffset, poffset + 2, CAP_POSITION ); - else - return start_capture( soffset, poffset + 1, CAP_UNFINISHED ); - case ')': - return end_capture( soffset, poffset + 1 ); - case L_ESC: - switch ( p.charAt( poffset+1 ) ) { - case 'b': - soffset = matchbalance( soffset, poffset + 2 ); - if ( soffset == -1 ) return -1; - // poffset += 4; goto init; - return match( soffset, poffset + 4 ); - case 'f': { - poffset += 2; - if ( p.charAt( poffset ) != '[' ) { - vm.push("Missing [ after %f in pattern"); - vm.lua_error(); - } - int ep = classend( poffset ); - int previous = ( soffset == 0 ) ? -1 : s.charAt( soffset - 1 ); - if ( matchbracketclass( previous, poffset, ep - 1 ) || - matchbracketclass( s.charAt( soffset ), poffset, ep - 1 ) ) - return -1; - // poffset = ep; goto init - return match( soffset, ep ); - } - default: { - int c = p.charAt( poffset + 1 ); - if ( Character.isDigit( (char) c ) ) { - soffset = match_capture( soffset, c ); - if ( soffset == -1 ) + while ( true ) { + // Check if we are at the end of the pattern - + // equivalent to the '\0' case in the C version, but our pattern + // string is not NUL-terminated. + if ( poffset == p.length() ) + return soffset; + switch ( p.luaByte( poffset ) ) { + case '(': + if ( p.luaByte( poffset + 1 ) == ')' ) + return start_capture( soffset, poffset + 2, CAP_POSITION ); + else + return start_capture( soffset, poffset + 1, CAP_UNFINISHED ); + case ')': + return end_capture( soffset, poffset + 1 ); + case L_ESC: + switch ( p.luaByte( poffset+1 ) ) { + case 'b': + soffset = matchbalance( soffset, poffset + 2 ); + if ( soffset == -1 ) return -1; + poffset += 4; + continue; + case 'f': { + poffset += 2; + if ( p.luaByte( poffset ) != '[' ) { + vm.push("Missing [ after %f in pattern"); + vm.lua_error(); + } + int ep = classend( poffset ); + int previous = ( soffset == 0 ) ? -1 : s.luaByte( soffset - 1 ); + if ( matchbracketclass( previous, poffset, ep - 1 ) || + matchbracketclass( s.luaByte( soffset ), poffset, ep - 1 ) ) return -1; - // poffset += 2; goto init; - return match( soffset, poffset + 2 ); + poffset = ep; + continue; } - return match2( soffset, poffset ); // XXX: better name + default: { + int c = p.luaByte( poffset + 1 ); + if ( Character.isDigit( (char) c ) ) { + soffset = match_capture( soffset, c ); + if ( soffset == -1 ) + return -1; + return match( soffset, poffset + 2 ); + } + } + } + case '$': + if ( poffset + 1 == p.length() ) + return ( soffset == s.length() ) ? soffset : -1; } + int ep = classend( poffset ); + boolean m = soffset < s.length() && singlematch( s.luaByte( soffset ), poffset, ep ); + int pc = ( ep < p.length() ) ? p.luaByte( ep ) : '\0'; + + switch ( pc ) { + case '?': + int res; + if ( m && ( ( res = match( soffset + 1, ep + 1 ) ) != -1 ) ) + return res; + poffset = ep + 1; + continue; + case '*': + return max_expand( soffset, poffset, ep ); + case '+': + return ( m ? max_expand( soffset + 1, poffset, ep ) : -1 ); + case '-': + return min_expand( soffset, poffset, ep ); + default: + if ( !m ) + return -1; + soffset++; + poffset = ep; + continue; } - case '$': - if ( poffset + 1 == p.length() ) - return ( soffset == s.length() ) ? soffset : -1; - return match2( soffset, poffset ); - } - return match2( soffset, poffset ); - } - - int match2( int soffset, int poffset ) { - int ep = classend( poffset ); - boolean m = soffset < s.length() && singlematch( s.charAt( soffset ), poffset, ep ); - int pc = ( ep < p.length() ) ? p.charAt( ep ) : '\0'; - - switch ( pc ) { - case '?': - int res; - if ( m && ( ( res = match( soffset + 1, ep + 1 ) ) != -1 ) ) - return res; - // p = ep + 1; goto init; - return match( soffset, ep + 1 ); - case '*': - return max_expand( soffset, poffset, ep ); - case '+': - return ( m ? max_expand( soffset + 1, poffset, ep ) : -1 ); - case '-': - return min_expand( soffset, poffset, ep ); - default: - if ( !m ) return -1; - // s++; p = ep; goto init; - return match( soffset+1, ep ); } } int max_expand( int soff, int poff, int ep ) { int i = 0; while ( soff + i < s.length() && - singlematch( s.charAt( soff + i ), poff, ep ) ) + singlematch( s.luaByte( soff + i ), poff, ep ) ) i++; while ( i >= 0 ) { int res = match( soff + i, ep + 1 ); @@ -593,7 +605,7 @@ public class StrLib { int res = match( soff, ep + 1 ); if ( res != -1 ) return res; - else if ( soff < s.length() && singlematch( s.charAt( soff ), poff, ep ) ) + else if ( soff < s.length() && singlematch( s.luaByte( soff ), poff, ep ) ) soff++; else return -1; } @@ -639,17 +651,17 @@ public class StrLib { vm.push( "unbalanced pattern" ); vm.lua_error(); } - if ( s.charAt( soff ) != p.charAt( poff ) ) + if ( s.luaByte( soff ) != p.luaByte( poff ) ) return -1; else { - int b = p.charAt( poff ); - int e = p.charAt( poff + 1 ); + int b = p.luaByte( poff ); + int e = p.luaByte( poff + 1 ); int cont = 1; while ( ++soff < s.length() ) { - if ( s.charAt( soff ) == e ) { + if ( s.luaByte( soff ) == e ) { if ( --cont == 0 ) return soff + 1; } - else if ( s.charAt( soff ) == b ) cont++; + else if ( s.luaByte( soff ) == b ) cont++; } } return -1; diff --git a/src/main/java/lua/value/LString.java b/src/main/java/lua/value/LString.java index 67945b9c..9254b127 100644 --- a/src/main/java/lua/value/LString.java +++ b/src/main/java/lua/value/LString.java @@ -121,7 +121,32 @@ public class LString extends LValue { public int charAt( int index ) { if ( index < 0 || index >= m_length ) throw new IndexOutOfBoundsException(); - return (int)m_bytes[ index ] & 0x0FF; + return luaByte( index ); + } + + /** Java version of strpbrk, which is a terribly named C function. */ + public int indexOfAny( LString accept ) { + final int ilimit = m_offset + m_length; + final int jlimit = accept.m_offset + accept.m_length; + for ( int i = m_offset; i < ilimit; ++i ) { + for ( int j = accept.m_offset; j < jlimit; ++j ) { + if ( m_bytes[i] == accept.m_bytes[j] ) { + return i - m_offset; + } + } + } + return -1; + } + + public int indexOf( LString s, int start ) { + final int slen = s.length(); + final int limit = m_offset + m_length - slen; + for ( int i = m_offset + start; i <= limit; ++i ) { + if ( equals( m_bytes, i, s.m_bytes, s.m_offset, slen ) ) { + return i; + } + } + return -1; } public static LString valueOf( double d ) { @@ -356,6 +381,6 @@ public class LString extends LValue { } public int luaByte(int index) { - return m_bytes[m_offset + index]; + return m_bytes[m_offset + index] & 0x0FF; } } diff --git a/src/test/res/strlib.lua b/src/test/res/strlib.lua new file mode 100644 index 00000000..f3a97509 --- /dev/null +++ b/src/test/res/strlib.lua @@ -0,0 +1,15 @@ +print( string.find( "", "" ) ) +print( string.find( "ababaabbaba", "abb" ) ) +print( string.find( "ababaabbaba", "abb", 7 ) ) + +print( string.match( "aabaa", "a*" ) ) +print( string.match( "aabaa", "a*", 3 ) ) +print( string.match( "aabaa", "a*b" ) ) +print( string.match( "aabaa", "a*b", 3 ) ) + +print( string.match( "abbaaababaabaaabaa", "b(a*)b" ) ) + +print( string.match( "abbaaababaabaaabaa", "b(a*)()b" ) ) +print( string.match( "abbaaababaabaaabaa", "b(a*)()b", 3 ) ) +print( string.match( "abbaaababaabaaabaa", "b(a*)()b", 8 ) ) +print( string.match( "abbaaababaabaaabaa", "b(a*)()b", 12 ) ) diff --git a/src/test/res/strlib.luac b/src/test/res/strlib.luac new file mode 100644 index 0000000000000000000000000000000000000000..3ec427f378bc8caed8ebc15c69e5b7378684ae32 GIT binary patch literal 982 zcmaiy%}&EG5QN9cmIlO+5JxJ+G33BA_yQ*`oax1;0wR?NmGqvxS@LGgCJmwj8%JZU z{q3wB+m{bp_Xr~*8IA$i&w8^i{q1_|IGS2Adtx}F?ksTMbZ9%Er= zW?JP)_Pkw>774w{Z|@IgTIB(67x9vXy(!mC1M8ACaTcvhuj$qQ*Ph4P#>h9nLW6Sv za=`9mQ&u%(&X8AAR<95}>zA@xK@1~^-kbCMoYx*A!-Pkw7+Bb$(z~ml-^QZMbj(xk ziYYC#qp|Ri3ni?kiCT9E=+JHW+!Z&>Ih^90quo;1b{FK96#8m(k0B*wA!(tQ^b|tU hgNsRjBczNhrX%u@^d>?&A&cphJf<`9|HPD`eF5!gX7~UA literal 0 HcmV?d00001