You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

479 lines
15 KiB

/* /////////////////////////////////////////////////////////////////////////
* File: src/pattern.cpp
*
* Purpose: Pattern tokenising and node parsing for shwild implementation
*
* Created: 17th June 2005
* Updated: 20th December 2011
*
* Home: http://shwild.org/
*
* Copyright (c) 2005-2011, Sean Kelly and Matthew Wilson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* - Neither the names of Matthew Wilson and Sean Kelly nor the names of
* any contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* ////////////////////////////////////////////////////////////////////// */
/* /////////////////////////////////////////////////////////////////////////
* Includes
*/
#include <shwild/shwild.h>
#include "shwild_stlsoft.h"
#include "shwild_safestr.h"
#if defined(STLSOFT_COMPILER_IS_MSVC) && \
defined(SHWILD_USING_SAFE_STR_FUNCTIONS)
# pragma warning(disable : 4996)
#endif /* SHWILD_USING_SAFE_STR_FUNCTIONS && compiler */
#include "pattern.hpp"
#include "shwild_assert.h"
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#ifdef _MSC_VER
# include <crtdbg.h>
#endif /* _MSC_VER */
/* /////////////////////////////////////////////////////////////////////////
* Compiler features
*/
/* /////////////////////////////////////////////////////////////////////////
* Checks
*/
#if CHAR_MAX >= INT_MAX || CHAR_MIN <= INT_MIN
# error char must be smaller than int
#endif
/* /////////////////////////////////////////////////////////////////////////
* Helper functions
*/
namespace
{
    /* Length-bounded character search.
     *
     * Examines exactly \c len characters starting at \c s (a NUL character
     * does not end the scan) and returns a pointer to the first one equal to
     * \c ch, or NULL when none of them match.
     */
    char const* strnchr(char const* s, size_t len, char ch)
    {
        char const* const end = s + len;

        while(s != end)
        {
            if(*s == ch)
            {
                return s;
            }

            ++s;
        }

        return NULL;
    }
} // anonymous namespace
/* /////////////////////////////////////////////////////////////////////////
* API functions
*/
/* Reads a single token from the pattern text at \c buf.
 *
 * \param buf   Pattern text; must not be NULL.
 * \param len   Receives the number of pattern characters that make up the
 *              token: 2 for a backslash escape sequence (valid or not),
 *              otherwise 1. Must not be NULL.
 * \param flags SHWILD_F_* flags; the backslash-escape and range-support
 *              suppression flags are the ones consulted here.
 *
 * \return One of the TOK_* values, or the character itself when it is to be
 *   treated as a literal.
 */
static int get_token(char const* buf, size_t* len , unsigned flags)
{
    SHWILD_ASSERT( len && buf );

    *len = 1;

    int const first = *buf;

    if(TOK_END == first)
    {
        return TOK_END;
    }

    switch(first)
    {
        case '?':
            return TOK_WILD_1;
        case '*':
            return TOK_WILD_N;
        case '[':
            /* With range support suppressed a bracket is just a character. */
            if(0 != (flags & SHWILD_F_SUPPRESS_RANGE_SUPPORT))
            {
                return first;
            }
            return TOK_RANGE_BEG;
        case ']':
            if(0 != (flags & SHWILD_F_SUPPRESS_RANGE_SUPPORT))
            {
                return first;
            }
            return TOK_RANGE_END;
        case '\\':
            break; /* handled below */
        default:
            return first;
    }

    /* Only a backslash reaches this point. */
    if(0 != (flags & SHWILD_F_SUPPRESS_BACKSLASH_ESCAPE))
    {
        return first;
    }

    /* The escape sequence spans the backslash and the character after it. */
    *len = 2;

    int const escaped = buf[1];

    switch(escaped)
    {
        case '\\':
        case '?':
        case '*':
        case '[':
        case ']':
        case '^':
            /* An escaped special character is returned as a plain literal. */
            return escaped;
        default:
            return TOK_INVALID;
    }
}
/* Gathers a run of literal characters from the pattern text at \c buf.
 *
 * The characters are accumulated in \c scratch starting at index 1, and
 * \c content is set to refer to them; \c content therefore points into
 * \c scratch.
 *
 * \param content  Receives the pointer/length of the collected characters.
 * \param scratch  Working buffer; resized to hold the collected characters.
 * \param buf      Pattern text; must not be NULL.
 * \param len      Receives the number of pattern characters consumed. It is
 *                 not written when TOK_ENOMEM is returned. Must not be NULL.
 * \param flags    SHWILD_F_* flags, passed on to get_token().
 * \param bInRange Non-zero when reading the content of a range, in which
 *                 case '?' and '*' are taken as literals unless
 *                 SHWILD_F_SUPPRESS_RANGE_LITERAL_WILDCARD_SUPPORT is set.
 *
 * \return The token that terminated the run, or TOK_ENOMEM if \c scratch
 *   could not be resized.
 */
static int get_literal( shwild_slice_t& content, node_buffer_t &scratch, char const* buf, size_t* len, unsigned flags, int bInRange )
{
    SHWILD_ASSERT( NULL != buf && 0 != len );

    if(!scratch.resize(1)) // This trims without discarding any hard-won heap mem.
    {
        return TOK_ENOMEM;
    }

    bool const wildcardsAreLiteral =
        bInRange &&
        0 == (SHWILD_F_SUPPRESS_RANGE_LITERAL_WILDCARD_SUPPORT & flags);

    char const* cursor = buf;
    int         tok;

    for(;;)
    {
        size_t consumed;

        tok = get_token( cursor, &consumed, flags );

        if(wildcardsAreLiteral)
        {
            switch(tok)
            {
                case TOK_WILD_1:
                    tok = '?';
                    break;
                case TOK_WILD_N:
                    tok = '*';
                    break;
                default:
                    break;
            }
        }

        /*
         * NOTE: a token counts as a literal when its value lies within the
         * range of char, rather than by comparison against each specific
         * token value.
         */
        if( TOK_END == tok ||
            tok < CHAR_MIN ||
            tok > CHAR_MAX)
        {
            break;
        }

        size_t const used = scratch.size();

        if(!scratch.resize(used + 1))
        {
            return TOK_ENOMEM;
        }

        scratch[used] = static_cast<char>(tok);
        cursor += consumed;
    }

    /* Index 0 of scratch is a placeholder; the literal starts at index 1. */
    content.len = scratch.size() - 1;
    content.ptr = &scratch[1];

    *len = static_cast<size_t>(cursor - buf);

    return tok;
}
/* Parses the next node from the pattern text at \c buf.
 *
 * \param node    Receives the node type and, for literal and range nodes,
 *                a slice referring to the node's characters (held in
 *                \c scratch). Must not be NULL.
 * \param scratch Working buffer that backs \c node->data.
 * \param buf     Pattern text; must not be NULL.
 * \param len     Receives the number of pattern characters consumed when
 *                the call succeeds. Must not be NULL.
 * \param flags   SHWILD_F_* flags controlling which syntax is recognised.
 *
 * \return 0 on success; SHWILD_RC_PARSE_ERROR if the pattern is malformed;
 *   SHWILD_RC_ALLOC_ERROR if a buffer could not be resized.
 *
 * \note The <ctype.h> functions are always called with the character
 *   converted to unsigned char (a negative argument other than EOF is
 *   undefined behaviour), and their results are reduced to bool before being
 *   compared (they are only specified to return zero or non-zero).
 */
int get_node( node_t* node, node_buffer_t &scratch, char const* buf, size_t* len, unsigned flags )
{
    int     tok;
    size_t  tok_len;

    SHWILD_ASSERT( node && buf && len );

    *len = 0;

    switch( tok = get_token( buf, &tok_len, flags ) )
    {
        case TOK_INVALID:
        case TOK_RANGE_END:
            return SHWILD_RC_PARSE_ERROR;
        case TOK_END:
            node->type = NODE_END;
            node->data.len = 0;
            break;
        case TOK_WILD_1:
            node->type = NODE_WILD_1;
            node->data.len = 0;
            break;
        case TOK_WILD_N:
            node->type = NODE_WILD_N;
            node->data.len = 0;
            break;
        case TOK_RANGE_BEG:
            node->type = NODE_RANGE;
            node->data.len = 0;
            *len += tok_len;

            /* Read everything up to the closing ']' as literal characters. */
            tok = get_literal( node->data, scratch, buf + tok_len, &tok_len, flags, 1 );
            if(TOK_ENOMEM == tok)
            {
                return SHWILD_RC_ALLOC_ERROR;
            }
            else if(tok != TOK_RANGE_END)
            {
                *len = 0;
                return SHWILD_RC_PARSE_ERROR;
            }
            *len += tok_len;

            /* Re-read the closing ']'; its length is added to *len after the switch. */
            tok = get_token( buf + *len, &tok_len, flags );
            STLSOFT_SUPPRESS_UNUSED(tok);
            SHWILD_ASSERT( tok == TOK_RANGE_END );

            /* Account for !range
             *
             * tok_len is now the length of the ']' token, i.e. 1, so
             * buf[tok_len] is the first raw pattern character after '['. If
             * that is a backslash, the leading '^' was escaped and is an
             * ordinary member of the range.
             */
            if( 0 == (flags & SHWILD_F_SUPPRESS_RANGE_NOT_SUPPORT) &&
                node->data.len > 0 &&
                '^' == node->data.ptr[0] &&
                '\\' != buf[tok_len])
            {
                node->type = NODE_NOT_RANGE;
                ++node->data.ptr;
                --node->data.len;
            }

            /* Now must post-process if got any embedded '-'. */
            if( 0 == (flags & SHWILD_F_SUPPRESS_RANGE_CONTINUUM_SUPPORT) &&
                node->data.len > 0)
            {
                /* Search for the first '-', ignoring the first ... */
                char const*         first = node->data.ptr;
                char const*         minus = strnchr(first + 1, node->data.len - 1, '-');
                char const* const   last  = node->data.ptr + (node->data.len - 1);

                if(SHWILD_F_SUPPRESS_RANGE_LEADTRAIL_LITERAL_HYPHEN_SUPPORT == (flags & SHWILD_F_SUPPRESS_RANGE_LEADTRAIL_LITERAL_HYPHEN_SUPPORT))
                {
                    if( '-' == *first ||
                        '-' == *last)
                    {
                        return SHWILD_RC_PARSE_ERROR;
                    }
                }

                /* ... and the last, since they're treated as literals. */
                if( NULL != minus &&
                    minus != last)
                {
                    /* The expanded range is built in xstr (index 0 is a
                     * placeholder, as in scratch) and swapped into scratch
                     * once complete.
                     */
                    node_buffer_t       xstr(1);
                    char const*         begin = node->data.ptr;
                    char const* const   end   = begin + node->data.len;

                    while(NULL != minus && last != minus)
                    {
                        char prev = minus[-1];
                        char post = minus[+1];

                        const unsigned char uprev = static_cast<unsigned char>(prev);
                        const unsigned char upost = static_cast<unsigned char>(post);

                        const bool prevIsDigit = 0 != isdigit(uprev);
                        const bool postIsDigit = 0 != isdigit(upost);
                        const bool prevIsUpper = 0 != isupper(uprev);
                        const bool postIsUpper = 0 != isupper(upost);

                        if( !isalnum(uprev) ||
                            !isalnum(upost) ||
                            prevIsDigit != postIsDigit)
                        {
                            return SHWILD_RC_PARSE_ERROR; /* "[%-&]", "[a-9]" not valid */
                        }

                        /* Copy the plain characters that precede this continuum. */
                        const size_t sz = xstr.size();

                        if(!xstr.resize(sz + (static_cast<size_t>(minus - begin) - 1)))
                        {
                            return SHWILD_RC_ALLOC_ERROR;
                        }
                        std::copy(begin, minus - 1, xstr.begin() + sz);

                        /* One run for a same-case continuum; two (lower-case
                         * then upper-case) for a cross-case one.
                         */
                        char    runLo[2] = { 0, 0 };
                        char    runHi[2] = { 0, 0 };
                        size_t  numRuns;

                        if(prevIsUpper == postIsUpper)
                        {
                            if(post < prev)
                            {
                                if(0 != (flags & SHWILD_F_SUPPRESS_RANGE_CONTINUUM_HIGHLOW_SUPPORT))
                                {
                                    return SHWILD_RC_PARSE_ERROR;
                                }

                                prev = minus[+1];
                                post = minus[-1];
                            }

                            runLo[0] = prev;
                            runHi[0] = post;
                            numRuns  = 1;
                        }
                        else
                        {
                            if(0 != (flags & SHWILD_F_SUPPRESS_RANGE_CONTINUUM_CROSSCASE_SUPPORT))
                            {
                                return SHWILD_RC_PARSE_ERROR;
                            }

                            char prevLower = static_cast<char>(tolower(uprev));
                            char postLower = static_cast<char>(tolower(upost));
                            char prevUpper = static_cast<char>(toupper(uprev));
                            char postUpper = static_cast<char>(toupper(upost));

                            SHWILD_ASSERT((postLower < prevLower) == (postUpper < prevUpper));

                            if(postLower < prevLower)
                            {
                                if(0 != (flags & SHWILD_F_SUPPRESS_RANGE_CONTINUUM_HIGHLOW_SUPPORT))
                                {
                                    return SHWILD_RC_PARSE_ERROR;
                                }

                                // TODO: Use std::swap()
                                char ch;

                                ch          = prevLower;
                                prevLower   = postLower;
                                postLower   = ch;

                                ch          = prevUpper;
                                prevUpper   = postUpper;
                                postUpper   = ch;
                            }

                            runLo[0] = prevLower;
                            runHi[0] = postLower;
                            runLo[1] = prevUpper;
                            runHi[1] = postUpper;
                            numRuns  = 2;
                        }

                        /* The counter is an int, not a char, so that the
                         * loop terminates even when the upper bound is
                         * CHAR_MAX.
                         */
                        for(size_t run = 0; run != numRuns; ++run)
                        {
                            for(int c = runLo[run]; c <= runHi[run]; ++c)
                            {
                                const size_t n = xstr.size();

                                if(!xstr.resize(1 + n))
                                {
                                    return SHWILD_RC_ALLOC_ERROR;
                                }

                                xstr[n] = static_cast<char>(c);
                            }
                        }

                        /* Step past "-<post>" and look for the next continuum. */
                        begin = minus + 2;

                        if(end == begin)
                        {
                            break;
                        }
                        else
                        {
                            if('-' == *begin)
                            {
                                return SHWILD_RC_PARSE_ERROR; /* "[0-1-234]" not valid */
                            }

                            minus = strnchr(begin, static_cast<size_t>(end - begin), '-');
                        }
                    }

                    /* Copy whatever plain characters remain after the last continuum. */
                    const size_t sz = xstr.size();

                    if(!xstr.resize(sz + static_cast<size_t>(end - begin)))
                    {
                        return SHWILD_RC_ALLOC_ERROR;
                    }
                    std::copy(begin, end, xstr.begin() + sz);

                    scratch.swap(xstr);

                    node->data.len = scratch.size() - 1;
                    node->data.ptr = &scratch[1];
                }
            }
            break;
        default:
            node->type = NODE_LITERAL;
            node->data.len = 0;

            tok = get_literal( node->data, scratch, buf, &tok_len, flags, 0 );
            if(TOK_ENOMEM == tok)
            {
                return SHWILD_RC_ALLOC_ERROR;
            }
    }

    *len += tok_len;

    return 0;
}
/* Puts \c node into its initial state: type NODE_NOTHING with an empty
 * data slice (only the slice length is written).
 */
void node_init( node_t* node )
{
    node->data.len = 0;
    node->type = NODE_NOTHING;
}
/* Returns \c node to the NODE_NOTHING state and marks its data slice as
 * empty; the slice pointer is left as it was.
 */
void node_reset( node_t* node )
{
    node_t& target = *node;

    target.type     = NODE_NOTHING;
    target.data.len = 0;
}
/* ///////////////////////////// end of file //////////////////////////// */