Add pattern matcher implementation ported from C version of Lua.

Still work to do:
(1) Replace use of recursion with while/continue
(2) Remove as many array bounds checks as possible
This commit is contained in:
Ian Farmer
2007-09-18 04:18:39 +00:00
parent 108fabc58f
commit 534e69fcfb
3 changed files with 382 additions and 6 deletions

View File

@@ -10,7 +10,7 @@ public class StrLib {
* string.byte (s [, i [, j]])
*
* Returns the internal numerical codes of the
* characters s[i], s[i+1], ···, s[j]. The default value for i is 1; the
* characters s[i], s[i+1], ..., s[j]. The default value for i is 1; the
* default value for j is i.
*
* Note that numerical codes are not necessarily portable across platforms.
@@ -30,7 +30,7 @@ public class StrLib {
}
/**
* string.char (···)
* string.char (...)
*
* Receives zero or more integers. Returns a string with length equal
* to the number of arguments, in which each character has the internal
@@ -83,7 +83,7 @@ public class StrLib {
}
/**
* string.format (formatstring, ···)
* string.format (formatstring, ...)
*
* Returns a formatted version of its variable number of arguments following
* the description given in its first argument (which must be a string).
@@ -207,9 +207,45 @@ public class StrLib {
/**
* string.match (s, pattern [, init])
*
* Looks for the first match of pattern in the string s. If it finds one, then match returns the captures from the pattern; otherwise it returns nil. If pattern specifies no captures, then the whole match is returned. A third, optional numerical argument init specifies where to start the search; its default value is 1 and may be negative.
* Looks for the first match of pattern in the string s. If it finds one,
* then match returns the captures from the pattern; otherwise it returns
* nil. If pattern specifies no captures, then the whole match is returned.
* A third, optional numerical argument init specifies where to start the
* search; its default value is 1 and may be negative.
*/
static void match( VM vm ) {
static void match( VM vm ) {
    LString s = vm.getArgAsLuaString( 0 );
    LString pat = vm.getArgAsLuaString( 1 );
    int init = vm.getArgCount() > 2 ? vm.getArgAsInt( 2 ) : 1;
    // Turn the 1-based Lua position (negative values count from the end
    // of the string) into a 0-based offset clamped to [0, s.length()].
    if ( init > 0 ) {
        init = Math.min( init - 1, s.length() );
    } else if ( init < 0 ) {
        init = Math.max( 0, s.length() + init );
    }
    MatchState ms = new MatchState( vm, s, pat );
    // TODO: check if pattern contains special characters,
    // if not do a simpler search.

    // LString.charAt throws on an out-of-range index, so an empty pattern
    // must not be probed for a leading '^'. An empty pattern is simply
    // unanchored and matches the empty string at the start offset.
    final boolean anchor = pat.length() > 0 && pat.charAt( 0 ) == '^';
    final int poff = anchor ? 1 : 0;
    int soff = init;
    do {
        // Captures recorded by a failed attempt must not leak into the next one.
        ms.reset();
        int res = ms.match( soff, poff );
        if ( res != -1 ) {
            ms.push_captures( true, soff, res );
            return;
        }
        // The post-increment lets the last attempt start at s.length(), so
        // patterns that can match the empty string succeed at end of input.
        // An anchored pattern is tried only once.
    } while ( soff++ < s.length() && !anchor );
    vm.setResult( LNil.NIL );
}
/**
@@ -294,5 +330,329 @@ public class StrLib {
}
// Pattern matching implementation
// Escape character of the pattern language; it introduces character
// classes and the special items handled in MatchState.match.
private static final int L_ESC = '%';
// Size of the capture arrays allocated by MatchState; start_capture
// raises "too many captures" once this many captures are open.
private static final int MAX_CAPTURES = 32;
// Sentinel stored in MatchState.clen for a capture that has been opened
// but not yet closed.
private static final int CAP_UNFINISHED = -1;
// Sentinel stored in MatchState.clen for a position capture "()"; such a
// capture is pushed as a 1-based index rather than as a substring.
private static final int CAP_POSITION = -2;
/**
 * Backtracking matcher for Lua patterns, ported from the C version of Lua.
 *
 * One instance is bound to a subject string and a pattern. Offsets passed
 * between methods are 0-based indexes into those two strings, and -1 is
 * used throughout as the "no match" result.
 */
private static class MatchState {
    /** Subject string being searched. */
    final LString s;
    /** Pattern being matched against the subject. */
    final LString p;
    /** VM used to push results and to raise pattern errors. */
    final VM vm;
    /** Number of captures currently recorded (open or closed). */
    int level;
    /** Start offset in {@code s} of each capture. */
    int[] cinit;
    /** Length of each capture, or CAP_UNFINISHED / CAP_POSITION. */
    int[] clen;

    MatchState( VM vm, LString s, LString pattern ) {
        this.s = s;
        this.p = pattern;
        this.vm = vm;
        this.level = 0;
        this.cinit = new int[ MAX_CAPTURES ];
        this.clen = new int[ MAX_CAPTURES ];
    }

    /** Forgets all captures so the state can be reused for a new attempt. */
    void reset() {
        level = 0;
    }

    /**
     * Pushes the captures of a successful match onto the VM as results.
     *
     * @param wholeMatch if true and the pattern recorded no captures, the
     *                   whole matched range is pushed instead
     * @param soff       offset in the subject where the match starts
     * @param end        offset in the subject just past the match
     */
    void push_captures( boolean wholeMatch, int soff, int end ) {
        vm.setResult();
        int nlevels = ( this.level == 0 && wholeMatch ) ? 1 : this.level;
        for ( int i = 0; i < nlevels; ++i ) {
            push_onecapture( i, soff, end );
        }
    }

    /**
     * Pushes capture {@code i}: a substring, a 1-based position for a
     * position capture, or the whole match when no captures exist and
     * {@code i} is 0.
     */
    private void push_onecapture( int i, int soff, int end ) {
        if ( i >= this.level ) {
            if ( i == 0 ) {
                vm.push( s.substring( soff, end ) );
            }
        } else {
            int l = clen[i];
            if ( l == CAP_UNFINISHED ) {
                vm.luaL_error( "unfinished capture" );
            }
            if ( l == CAP_POSITION ) {
                // Lua positions are 1-based.
                vm.push( new LInteger( cinit[i] + 1 ) );
            } else {
                int begin = cinit[i];
                vm.push( s.substring( begin, begin + l ) );
            }
        }
    }

    /**
     * Converts the digit of a back-reference ({@code %1}..{@code %9}) into
     * a capture index, raising an error if that capture does not exist or
     * is still open.
     */
    private int check_capture( int l ) {
        l -= '1';
        if ( l < 0 || l >= level || this.clen[l] == CAP_UNFINISHED ) {
            vm.luaL_error("invalid capture index");
        }
        return l;
    }

    /** Returns the index of the innermost capture that is still open. */
    private int capture_to_close() {
        int level = this.level;
        for ( level--; level >= 0; level-- )
            if ( clen[level] == CAP_UNFINISHED )
                return level;
        vm.push("invalid pattern capture");
        vm.lua_error();
        return 0;
    }

    /**
     * Returns the pattern offset just past the single-character class that
     * starts at {@code poffset} (a plain character, a {@code %x} escape or
     * a {@code [...]} set). Raises a pattern error for a trailing
     * {@code %} or an unterminated set.
     */
    int classend( int poffset ) {
        switch ( p.charAt( poffset++ ) ) {
        case L_ESC:
            if ( poffset == p.length() ) {
                vm.push( "malformed pattern (ends with %)" );
                vm.lua_error();
            }
            return poffset + 1;
        case '[':
            if ( poffset < p.length() && p.charAt( poffset ) == '^' ) poffset++;
            do {
                // The C original relies on the terminating NUL here; the
                // pattern is not NUL-terminated, so test the length
                // explicitly before every read.
                if ( poffset >= p.length() ) {
                    vm.push( "malformed pattern (missing ])" );
                    vm.lua_error();
                    // NOTE(review): lua_error() is expected not to return
                    // normally (TODO confirm); this return only prevents an
                    // endless loop if it ever does.
                    return p.length();
                }
                // Skip the character following an escape (e.g. "%]") so it
                // is never taken for the closing bracket.
                if ( p.charAt( poffset++ ) == L_ESC && poffset != p.length() )
                    poffset++;
            } while ( poffset >= p.length() || p.charAt( poffset ) != ']' );
            return poffset + 1;
        default:
            return poffset;
        }
    }

    /** ASCII letter test. */
    static boolean isalpha( int c ) {
        return ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' );
    }

    /** ASCII decimal digit test. */
    private static boolean isdigit( int c ) {
        return c >= '0' && c <= '9';
    }

    /** ASCII punctuation test: printable, not a space, letter or digit. */
    private static boolean ispunct( int c ) {
        return c > ' ' && c < 0x7f && !isalpha( c ) && !isdigit( c );
    }

    /**
     * Tests character {@code c} against the class letter {@code cl} that
     * followed a {@code %} in the pattern. A lower-case class letter
     * selects the class and its upper-case form selects the complement.
     * Any other {@code cl} is an escaped literal and is compared directly.
     * All classes use ASCII definitions, consistent with {@link #isalpha}.
     */
    static boolean match_class( int c, int cl ) {
        boolean res;
        // Dispatch on the class letter, not on the subject character.
        switch ( Character.toLowerCase( cl ) ) {
        case 'a': res = isalpha( c ); break;
        case 'c': res = ( c >= 0 && c < 0x20 ) || c == 0x7f; break;
        case 'd': res = isdigit( c ); break;
        case 'l': res = ( c >= 'a' && c <= 'z' ); break;
        case 'p': res = ispunct( c ); break;
        case 's': res = ( c >= '\t' && c <= '\r' ) || c == ' '; break;
        case 'u': res = ( c >= 'A' && c <= 'Z' ); break;
        case 'w': res = isalpha( c ) || isdigit( c ); break;
        case 'x': res = isdigit( c )
                      || ( c >= 'a' && c <= 'f' )
                      || ( c >= 'A' && c <= 'F' ); break;
        case 'z': res = ( c == 0 ); break;
        default: return cl == c;
        }
        return ( cl >= 'a' && cl <= 'z' ) ? res : !res;
    }

    /**
     * Tests character {@code c} against the set whose opening {@code [} is
     * at pattern offset {@code poff} and whose closing {@code ]} is at
     * offset {@code ec}. Handles negation ({@code [^...]}), escapes and
     * ranges ({@code a-z}).
     */
    boolean matchbracketclass( int c, int poff, int ec ) {
        boolean sig = true;
        if ( p.charAt( poff + 1 ) == '^' ) {
            sig = false;
            poff++;
        }
        while ( ++poff < ec ) {
            if ( p.charAt( poff ) == L_ESC ) {
                poff++;
                if ( match_class( c, p.charAt( poff ) ) )
                    return sig;
            }
            else if ( ( p.charAt( poff + 1 ) == '-' ) && ( poff + 2 < ec ) ) {
                poff += 2;
                if ( p.charAt( poff - 2 ) <= c && c <= p.charAt( poff ) )
                    return sig;
            }
            else if ( p.charAt( poff ) == c ) return sig;
        }
        return !sig;
    }

    /**
     * Tests character {@code c} against the single-character class that
     * starts at pattern offset {@code poff} and ends just before
     * {@code ep} (as computed by {@link #classend}).
     */
    boolean singlematch( int c, int poff, int ep ) {
        switch ( p.charAt( poff ) ) {
        case '.': return true;
        case L_ESC: return match_class( c, p.charAt( poff + 1 ) );
        case '[': return matchbracketclass( c, poff, ep - 1 );
        default: return p.charAt( poff ) == c;
        }
    }

    /**
     * Perform pattern matching. If there is a match, returns offset into s
     * where match ends, otherwise returns -1.
     */
    int match( int soffset, int poffset ) {
        // The C version loops with "goto init"; here every such jump is a
        // recursive call. Java does not eliminate tail calls, so recursion
        // depth grows with the length of the pattern.
        //
        // End of pattern: equivalent to the '\0' case in the C version, but
        // our pattern string is not NUL-terminated.
        if ( poffset == p.length() )
            return soffset;
        switch ( p.charAt( poffset ) ) {
        case '(':
            // A '(' that ends the pattern opens a capture that is never
            // closed; it is reported later as "unfinished capture".
            if ( poffset + 1 < p.length() && p.charAt( poffset + 1 ) == ')' )
                return start_capture( soffset, poffset + 2, CAP_POSITION );
            else
                return start_capture( soffset, poffset + 1, CAP_UNFINISHED );
        case ')':
            return end_capture( soffset, poffset + 1 );
        case L_ESC:
            if ( poffset + 1 >= p.length() ) {
                // Trailing '%': classend (via match2) raises the
                // "malformed pattern" error.
                return match2( soffset, poffset );
            }
            switch ( p.charAt( poffset + 1 ) ) {
            case 'b':
                soffset = matchbalance( soffset, poffset + 2 );
                if ( soffset == -1 ) return -1;
                // Skip "%bxy".
                return match( soffset, poffset + 4 );
            case 'f': {
                poffset += 2;
                if ( poffset >= p.length() || p.charAt( poffset ) != '[' ) {
                    vm.push("Missing [ after %f in pattern");
                    vm.lua_error();
                }
                int ep = classend( poffset );
                // Positions outside the subject behave as '\0', as in the
                // C implementation.
                int previous = ( soffset == 0 ) ? '\0' : s.charAt( soffset - 1 );
                int current = ( soffset < s.length() ) ? s.charAt( soffset ) : '\0';
                // A frontier requires the previous character to be outside
                // the set and the current character to be inside it.
                if ( matchbracketclass( previous, poffset, ep - 1 ) ||
                     !matchbracketclass( current, poffset, ep - 1 ) )
                    return -1;
                return match( soffset, ep );
            }
            default: {
                int c = p.charAt( poffset + 1 );
                if ( isdigit( c ) ) {
                    // Back-reference to an earlier capture.
                    soffset = match_capture( soffset, c );
                    if ( soffset == -1 )
                        return -1;
                    return match( soffset, poffset + 2 );
                }
                return match2( soffset, poffset ); // XXX: better name
            }
            }
        case '$':
            // '$' anchors only when it is the last pattern character;
            // anywhere else it is an ordinary character.
            if ( poffset + 1 == p.length() )
                return ( soffset == s.length() ) ? soffset : -1;
            return match2( soffset, poffset );
        }
        return match2( soffset, poffset );
    }

    /**
     * Default case of {@link #match}: matches one single-character class,
     * honouring an optional repetition suffix ({@code ? * + -}).
     */
    int match2( int soffset, int poffset ) {
        int ep = classend( poffset );
        boolean m = soffset < s.length() && singlematch( s.charAt( soffset ), poffset, ep );
        int pc = ( ep < p.length() ) ? p.charAt( ep ) : '\0';
        switch ( pc ) {
        case '?': {
            int res;
            if ( m && ( ( res = match( soffset + 1, ep + 1 ) ) != -1 ) )
                return res;
            // Try again with the optional item matching nothing.
            return match( soffset, ep + 1 );
        }
        case '*':
            return max_expand( soffset, poffset, ep );
        case '+':
            return ( m ? max_expand( soffset + 1, poffset, ep ) : -1 );
        case '-':
            return min_expand( soffset, poffset, ep );
        default:
            if ( !m ) return -1;
            return match( soffset + 1, ep );
        }
    }

    /**
     * Greedy repetition: consumes as many characters of the class as
     * possible, then backs off one at a time until the rest of the
     * pattern matches.
     */
    int max_expand( int soff, int poff, int ep ) {
        int i = 0;
        while ( soff + i < s.length() &&
                singlematch( s.charAt( soff + i ), poff, ep ) )
            i++;
        while ( i >= 0 ) {
            int res = match( soff + i, ep + 1 );
            if ( res != -1 )
                return res;
            i--;
        }
        return -1;
    }

    /**
     * Lazy repetition: tries the rest of the pattern first and consumes
     * one more character of the class only when that fails.
     */
    int min_expand( int soff, int poff, int ep ) {
        for ( ;; ) {
            int res = match( soff, ep + 1 );
            if ( res != -1 )
                return res;
            else if ( soff < s.length() && singlematch( s.charAt( soff ), poff, ep ) )
                soff++;
            else return -1;
        }
    }

    /**
     * Opens a capture at {@code soff} and continues matching at
     * {@code poff}; the capture is discarded again if the rest fails.
     *
     * @param what CAP_UNFINISHED for a normal capture, CAP_POSITION for "()"
     */
    int start_capture( int soff, int poff, int what ) {
        int res;
        int level = this.level;
        if ( level >= MAX_CAPTURES ) {
            vm.push( "too many captures" );
            vm.lua_error();
        }
        cinit[ level ] = soff;
        clen[ level ] = what;
        this.level = level + 1;
        if ( ( res = match( soff, poff ) ) == -1 )
            this.level--;
        return res;
    }

    /**
     * Closes the innermost open capture at {@code soff} and continues
     * matching; the capture is reopened if the rest fails.
     */
    int end_capture( int soff, int poff ) {
        int l = capture_to_close();
        int res;
        clen[l] = soff - cinit[l];
        if ( ( res = match( soff, poff ) ) == -1 )
            clen[l] = CAP_UNFINISHED;
        return res;
    }

    /**
     * Matches a back-reference: the subject at {@code soff} must repeat
     * the text of the capture selected by digit {@code l}.
     *
     * @return offset just past the repeated text, or -1
     */
    int match_capture( int soff, int l ) {
        l = check_capture( l );
        int len = clen[ l ];
        // A position capture has a negative sentinel length and no text to
        // compare, so it can never be matched as a back-reference.
        if ( len >= 0 &&
             ( s.length() - soff ) >= len &&
             LString.equals( s, cinit[l], s, soff, len ) )
            return soff + len;
        else
            return -1;
    }

    /**
     * Matches a balanced run for {@code %bxy}; {@code poff} is the pattern
     * offset of the opening delimiter {@code x}.
     *
     * @return offset just past the closing delimiter, or -1
     */
    int matchbalance( int soff, int poff ) {
        final int plen = p.length();
        if ( poff == plen || poff + 1 == plen ) {
            vm.push( "unbalanced pattern" );
            vm.lua_error();
        }
        // At end of subject there is no character that could open the run.
        if ( soff >= s.length() || s.charAt( soff ) != p.charAt( poff ) )
            return -1;
        final int b = p.charAt( poff );
        final int e = p.charAt( poff + 1 );
        int cont = 1;
        while ( ++soff < s.length() ) {
            final int ch = s.charAt( soff );
            // The closing delimiter is tested first so that "%bxx" (same
            // open and close character) terminates on the next occurrence.
            if ( ch == e ) {
                if ( --cont == 0 ) return soff + 1;
            }
            else if ( ch == b ) cont++;
        }
        return -1;
    }
}
}

View File

@@ -118,6 +118,12 @@ public class LString extends LValue {
return new LString( m_bytes, m_offset + beginIndex, endIndex - beginIndex );
}
/**
 * Returns the byte at the given 0-based position of this string as an
 * unsigned value in the range 0..255.
 *
 * @param index position within this string, 0 &lt;= index &lt; length
 * @return the unsigned byte value at that position
 * @throws IndexOutOfBoundsException if index is negative or not less
 *         than the string length
 */
public int charAt( int index ) {
    if ( index < 0 || index >= m_length )
        throw new IndexOutOfBoundsException( "index " + index + ", length " + m_length );
    // substring() shares m_bytes and only shifts m_offset, so the logical
    // index has to be translated into the backing array.
    return m_bytes[ m_offset + index ] & 0x0FF;
}
/** Creates an LString holding the decimal text form of the given double. */
public static LString valueOf( double d ) {
    final String text = Double.toString( d );
    return new LString( text );
}
@@ -275,7 +281,13 @@ public class LString extends LValue {
return s_stringMT;
}
/**
 * Compares n bytes of a starting at logical index i with n bytes of b
 * starting at logical index j, by delegating to the byte-array overload
 * with both indexes shifted by the strings' offsets.
 */
public static boolean equals( LString a, int i, LString b, int j, int n ) {
    final int aStart = a.m_offset + i;
    final int bStart = b.m_offset + j;
    return equals( a.m_bytes, aStart, b.m_bytes, bStart, n );
}
public static boolean equals( byte[] a, int i, byte[] b, int j, int n ) {
if ( a.length < i + n || b.length < j + n )
return false;
final int imax = i + n;
final int jmax = j + n;
while ( i < imax && j < jmax ) {

View File

@@ -76,6 +76,10 @@ public class LuaJTest extends TestCase {
runTest( "setlist" );
}
/** Runs the "strlib" test script through the shared runTest helper. */
public void testStrLib() throws IOException, InterruptedException {
    final String script = "strlib";
    runTest( script );
}
/** Runs the "type" test script through the shared runTest helper. */
public void testType() throws IOException, InterruptedException {
    final String script = "type";
    runTest( script );
}