Newer
Older
monitord / monitord / regexp.cpp
@root root on 23 Jan 2012 19 KB Migration from SVN revision 455
////////////////////////////////////////////////////////////////////////////////
// RegExp.cpp
////////////////////////////////////////////////////////////////////////////////

#include "stdafx.h"
#include "RegExp.h"


// definition	number	opnd?	meaning
#define	END		0		// no	End of program.
#define	BOL		1		// no	Match beginning of line.
#define	EOL		2		// no	Match end of line.
#define	ANY		3		// no	Match any character.
#define	ANYOF	4		// str	Match any of these.
#define	ANYBUT	5		// str	Match any but one of these.
#define	BRANCH	6		// node	Match this, or the next..\&.
#define	BACK	7		// no	"next" ptr points backward.
#define	EXACTLY	8		// str	Match this string.
#define	NOTHING	9		// no	Match empty string.
#define	STAR	10		// node	Match this 0 or more times.
#define	PLUS	11		// node	Match this 1 or more times.
#define	OPEN	20		// no	Sub-RE starts here.
						//		OPEN+1 is number 1, etc.
#define	CLOSE	30		// no	Analogous to OPEN.

// Utility definitions.

#define	FAIL(m)		{ regerror(m); return(NULL); }
#define	ISREPN(c)	((c) == _T('*') || (c) == _T('+') || (c) == _T('?'))
#define	META		"^$.[()|?+*\\"

// Flags to be passed up and down.

#define	HASWIDTH	01	// Known never to match null string.
#define	SIMPLE		02	// Simple enough to be STAR/PLUS operand.
#define	SPSTART		04	// Starts with * or +.
#define	WORST		0	// Worst case.


CRegExp::CRegExp()
{
	bCompiled = FALSE;
	program = NULL;
	sFoundText = NULL;

	for( int i = 0; i < NSUBEXP; i++ )
	{
		startp[i] = NULL;
		endp[i] = NULL;
	}
}

CRegExp::~CRegExp()
{
	delete program;
	delete sFoundText;
}


CRegExp* CRegExp::RegComp(const TCHAR *exp)
{
	TCHAR *scan;
	int flags;

	if (exp == NULL)
		return NULL;

	bCompiled = TRUE;

	// First pass: determine size, legality.
	bEmitCode = FALSE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regsize = 0L;
	regdummy[0] = NOTHING;
	regdummy[1] = regdummy[2] = 0;
	regcode = regdummy;
	if (reg(0, &flags) == NULL)
		return(NULL);

	// Allocate space.
	delete program;
	program = new TCHAR[regsize];
	memset( program, 0, regsize * sizeof(TCHAR) );

	if (program == NULL)
		return NULL;

	// Second pass: emit code.
	bEmitCode = TRUE;
	regparse = (TCHAR *)exp;
	regnpar = 1;
	regcode = program;
	if (reg(0, &flags) == NULL)
		return NULL;

	// Dig out information for optimizations.
	regstart = _T('\0');		// Worst-case defaults.
	reganch = 0;
	regmust = NULL;
	regmlen = 0;
	scan = program;		// First BRANCH.
	if (OP(regnext(scan)) == END)
	{
		// Only one top-level choice.
		scan = OPERAND(scan);

		// Starting-point info.
		if (OP(scan) == EXACTLY)
			regstart = *OPERAND(scan);
		else if (OP(scan) == BOL)
			reganch = 1;

		// If there's something expensive in the r.e., find the
		// longest literal string that must appear and make it the
		// regmust.  Resolve ties in favor of later strings, since
		// the regstart check works with the beginning of the r.e.
		// and avoiding duplication strengthens checking.  Not a
		// strong reason, but sufficient in the absence of others.

		if (flags&SPSTART)
		{
			char *longest = NULL;
			size_t len = 0;

			for (; scan != NULL; scan = regnext(scan))
				if (OP(scan) == EXACTLY && _tcslen(OPERAND(scan)) >= len)
				{
					longest = OPERAND(scan);
					len = _tcslen(OPERAND(scan));
				}
			regmust = longest;
			regmlen = (int)len;
		}
	}

	return this;
}

// reg - regular expression, i.e. main body or parenthesized thing
//
// Caller must absorb opening parenthesis.
//
// Combining parenthesis handling with the base level of regular expression
// is a trifle forced, but the need to tie the tails of the branches to what
// follows makes it hard to avoid.



TCHAR *CRegExp::reg(int paren, int *flagp)
{
	char *ret;
	char *br;
	char *ender;
	int parno;
	int flags;

	*flagp = HASWIDTH;	// Tentatively.

	if (paren)
	{
		// Make an OPEN node.
		if (regnpar >= NSUBEXP)
		{
			TRACE1("Too many (). NSUBEXP is set to %d\n", NSUBEXP );
			return NULL;
		}
		parno = regnpar;
		regnpar++;
		ret = regnode(OPEN+parno);
	}

	// Pick up the branches, linking them together.
	br = regbranch(&flags);
	if (br == NULL)
		return(NULL);
	if (paren)
		regtail(ret, br);	// OPEN -> first.
	else
		ret = br;
	*flagp &= ~(~flags&HASWIDTH);	// Clear bit if bit 0.
	*flagp |= flags&SPSTART;
	while (*regparse == _T('|')) {
		regparse++;
		br = regbranch(&flags);
		if (br == NULL)
			return(NULL);
		regtail(ret, br);	// BRANCH -> BRANCH.
		*flagp &= ~(~flags&HASWIDTH);
		*flagp |= flags&SPSTART;
	}

	// Make a closing node, and hook it on the end.
	ender = regnode((paren) ? CLOSE+parno : END);
	regtail(ret, ender);

	// Hook the tails of the branches to the closing node.
	for (br = ret; br != NULL; br = regnext(br))
		regoptail(br, ender);

	// Check for proper termination.
	if (paren && *regparse++ != _T(')'))
	{
		TRACE0("unterminated ()\n");
		return NULL;
	}
	else if (!paren && *regparse != _T('\0'))
	{
		if (*regparse == _T(')'))
		{
			TRACE0("unmatched ()\n");
			return NULL;
		}
		else
		{
			TRACE0("internal error: junk on end\n");
			return NULL;
		}
		// NOTREACHED
	}

	return(ret);
}




//
// regbranch - one alternative of an | operator
//
// Implements the concatenation operator.

TCHAR *CRegExp::regbranch(int *flagp)
{
	TCHAR *ret;
	TCHAR *chain;
	TCHAR *latest;
	int flags;
	int c;

	*flagp = WORST;				// Tentatively.

	ret = regnode(BRANCH);
	chain = NULL;
	while ((c = *regparse) != _T('\0') && c != _T('|') && c != _T(')')) {
		latest = regpiece(&flags);
		if (latest == NULL)
			return(NULL);
		*flagp |= flags&HASWIDTH;
		if (chain == NULL)		// First piece.
			*flagp |= flags&SPSTART;
		else
			regtail(chain, latest);
		chain = latest;
	}
	if (chain == NULL)			// Loop ran zero times.
		(void) regnode(NOTHING);

	return(ret);
}

//
// regpiece - something followed by possible [*+?]
//
// Note that the branching code sequences used for ? and the general cases
// of * and + are somewhat optimized:  they use the same NOTHING node as
// both the endmarker for their branch list and the body of the last branch.
// It might seem that this node could be dispensed with entirely, but the
// endmarker role is not redundant.

TCHAR *CRegExp::regpiece(int *flagp)
{
	TCHAR *ret;
	TCHAR op;
	TCHAR *next;
	int flags;

	ret = regatom(&flags);
	if (ret == NULL)
		return(NULL);

	op = *regparse;
	if (!ISREPN(op)) {
		*flagp = flags;
		return(ret);
	}

	if (!(flags&HASWIDTH) && op != _T('?'))
	{
		TRACE0("*+ operand could be empty\n");
		return NULL;
	}

	switch (op) {
	case _T('*'):	*flagp = WORST|SPSTART;			break;
	case _T('+'):	*flagp = WORST|SPSTART|HASWIDTH;	break;
	case _T('?'):	*flagp = WORST;				break;
	}

	if (op == _T('*') && (flags&SIMPLE))
		reginsert(STAR, ret);
	else if (op == _T('*')) {
		// Emit x* as (x&|), where & means "self".
		reginsert(BRANCH, ret);		// Either x
		regoptail(ret, regnode(BACK));	// and loop
		regoptail(ret, ret);		// back
		regtail(ret, regnode(BRANCH));	// or
		regtail(ret, regnode(NOTHING));	// null.
	} else if (op == _T('+') && (flags&SIMPLE))
		reginsert(PLUS, ret);
	else if (op == _T('+')) {
		// Emit x+ as x(&|), where & means "self".
		next = regnode(BRANCH);		// Either
		regtail(ret, next);
		regtail(regnode(BACK), ret);	// loop back
		regtail(next, regnode(BRANCH));	// or
		regtail(ret, regnode(NOTHING));	// null.
	} else if (op == _T('?')) {
		// Emit x? as (x|)
		reginsert(BRANCH, ret);		// Either x
		regtail(ret, regnode(BRANCH));	// or
		next = regnode(NOTHING);		// null.
		regtail(ret, next);
		regoptail(ret, next);
	}
	regparse++;
	if (ISREPN(*regparse))
	{
		TRACE0("nested *?+\n");
		return NULL;
	}

	return(ret);
}

//
// regatom - the lowest level
//
// Optimization:  gobbles an entire sequence of ordinary characters so that
// it can turn them into a single node, which is smaller to store and
// faster to run.  Backslashed characters are exceptions, each becoming a
// separate node; the code is simpler that way and it's not worth fixing.

TCHAR *CRegExp::regatom(int *flagp)
{
	TCHAR *ret;
	int flags;

	*flagp = WORST;		// Tentatively.

	switch (*regparse++) {
	case _T('^'):
		ret = regnode(BOL);
		break;
	case _T('$'):
		ret = regnode(EOL);
		break;
	case _T('.'):
		ret = regnode(ANY);
		*flagp |= HASWIDTH|SIMPLE;
		break;
	case _T('['): {
		int range;
		int rangeend;
		int c;

		if (*regparse == _T('^')) {	// Complement of range.
			ret = regnode(ANYBUT);
			regparse++;
		} else
			ret = regnode(ANYOF);
		if ((c = *regparse) == _T(']') || c == _T('-')) {
			regc(c);
			regparse++;
		}
		while ((c = *regparse++) != _T('\0') && c != _T(']')) {
			if (c != _T('-'))
				regc(c);
			else if ((c = *regparse) == _T(']') || c == _T('\0'))
				regc(_T('-'));
			else
			{
				range = (unsigned) (TCHAR)*(regparse-2);
				rangeend = (unsigned) (TCHAR)c;
				if (range > rangeend)
				{
					TRACE0("invalid [] range\n");
					return NULL;
				}
				for (range++; range <= rangeend; range++)
					regc(range);
				regparse++;
			}
		}
		regc(_T('\0'));
		if (c != _T(']'))
		{
			TRACE0("unmatched []\n");
			return NULL;
		}
		*flagp |= HASWIDTH|SIMPLE;
		break;
		}
	case _T('('):
		ret = reg(1, &flags);
		if (ret == NULL)
			return(NULL);
		*flagp |= flags&(HASWIDTH|SPSTART);
		break;
	case _T('\0'):
	case _T('|'):
	case _T(')'):
		// supposed to be caught earlier
		TRACE0("internal error: \\0|) unexpected\n");
		return NULL;
		break;
	case _T('?'):
	case _T('+'):
	case _T('*'):
		TRACE0("?+* follows nothing\n");
		return NULL;
		break;
	case _T('\\'):
		if (*regparse == _T('\0'))
		{
			TRACE0("trailing \\\n");
			return NULL;
		}
		ret = regnode(EXACTLY);
		regc(*regparse++);
		regc(_T('\0'));
		*flagp |= HASWIDTH|SIMPLE;
		break;
	default: {
		size_t len;
		TCHAR ender;

		regparse--;
		len = _tcscspn(regparse, META);
		if (len == 0)
		{
			TRACE0("internal error: strcspn 0\n");
			return NULL;
		}
		ender = *(regparse+len);
		if (len > 1 && ISREPN(ender))
			len--;		// Back off clear of ?+* operand.
		*flagp |= HASWIDTH;
		if (len == 1)
			*flagp |= SIMPLE;
		ret = regnode(EXACTLY);
		for (; len > 0; len--)
			regc(*regparse++);
		regc(_T('\0'));
		break;
		}
	}

	return(ret);
}



// reginsert - insert an operator in front of already-emitted operand
//
// Means relocating the operand.

void CRegExp::reginsert(TCHAR op, TCHAR *opnd)
{
	TCHAR *place;

	if (!bEmitCode) {
		regsize += 3;
		return;
	}

	(void) memmove(opnd+3, opnd, (size_t)((regcode - opnd)*sizeof(TCHAR)));
	regcode += 3;

	place = opnd;		// Op node, where operand used to be.
	*place++ = op;
	*place++ = _T('\0');
	*place++ = _T('\0');
}

//
// regtail - set the next-pointer at the end of a node chain

void CRegExp::regtail(TCHAR *p, TCHAR *val)
{
	TCHAR *scan;
	TCHAR *temp;
//	int offset;

	if (!bEmitCode)
		return;

	// Find last node.
	for (scan = p; (temp = regnext(scan)) != NULL; scan = temp)
		continue;

	*((short *)(scan+1)) = (OP(scan) == BACK) ? scan - val : val - scan;
}


// regoptail - regtail on operand of first argument; nop if operandless

void CRegExp::regoptail(TCHAR *p, TCHAR *val)
{
	// "Operandless" and "op != BRANCH" are synonymous in practice.
	if (!bEmitCode || OP(p) != BRANCH)
		return;
	regtail(OPERAND(p), val);
}


// RegFind	- match a regexp against a string
// Returns	- Returns position of regexp or -1
//			  if regular expression not found
// Note		- The regular expression should have been
//			  previously compiled using RegComp
int CRegExp::RegFind(const TCHAR *str)
{
	TCHAR *string = (TCHAR *)str;	// avert const poisoning
	TCHAR *s;

	// Delete any previously stored found string
	delete sFoundText;
	sFoundText = NULL;

	// Be paranoid.
	if(string == NULL)
	{
		TRACE0("NULL argument to regexec\n");
		return(-1);
	}

	// Check validity of regex
	if (!bCompiled)
	{
		TRACE0("No regular expression provided yet.\n");
		return(-1);
	}

	// If there is a "must appear" string, look for it.
	if (regmust != NULL && _tcsstr(string, regmust) == NULL)
		return(-1);

	// Mark beginning of line for ^
	regbol = string;

	// Simplest case:  anchored match need be tried only once.
	if (reganch)
	{
		if( regtry(string) )
		{
			// Save the found substring in case we need it
			sFoundText = new TCHAR[GetFindLen()+1];
			sFoundText[GetFindLen()] = _T('\0');
			_tcsncpy(sFoundText, string, GetFindLen() );

			return 0;
		}
		//String not found
		return -1;
	}

	// Messy cases:  unanchored match.
	if (regstart != _T('\0'))
	{
		// We know what TCHAR it must start with.
		for (s = string; s != NULL; s = _tcschr(s+1, regstart))
			if (regtry(s))
			{
				int nPos = s-str;

				// Save the found substring in case we need it later
				sFoundText = new TCHAR[GetFindLen()+1];
				sFoundText[GetFindLen()] = _T('\0');
				_tcsncpy(sFoundText, s, GetFindLen() );

				return nPos;
			}
		return -1;
	}
	else
	{
		// We don't -- general case
		for (s = string; !regtry(s); s++)
			if (*s == _T('\0'))
				return(-1);

		int nPos = s-str;

		// Save the found substring in case we need it later
		sFoundText = new TCHAR[GetFindLen()+1];
		sFoundText[GetFindLen()] = _T('\0');
		_tcsncpy(sFoundText, s, GetFindLen() );

		return nPos;
	}
	// NOTREACHED
}


// regtry - try match at specific point

int	CRegExp::regtry(TCHAR *string)
{
	int i;
	TCHAR **stp;
	TCHAR **enp;

	reginput = string;

	stp = startp;
	enp = endp;
	for (i = NSUBEXP; i > 0; i--)
	{
		*stp++ = NULL;
		*enp++ = NULL;
	}
	if (regmatch(program))
	{
		startp[0] = string;
		endp[0] = reginput;
		return(1);
	}
	else
		return(0);
}

// regmatch - main matching routine
//
// Conceptually the strategy is simple:  check to see whether the current
// node matches, call self recursively to see whether the rest matches,
// and then act accordingly.  In practice we make some effort to avoid
// recursion, in particular by going through "ordinary" nodes (that don't
// need to know whether the rest of the match failed) by a loop instead of
// by recursion.

int	CRegExp::regmatch(TCHAR *prog)
{
	TCHAR *scan;	// Current node.
	TCHAR *next;		// Next node.

	for (scan = prog; scan != NULL; scan = next) {
		next = regnext(scan);

		switch (OP(scan)) {
		case BOL:
			if (reginput != regbol)
				return(0);
			break;
		case EOL:
			if (*reginput != _T('\0'))
				return(0);
			break;
		case ANY:
			if (*reginput == _T('\0'))
				return(0);
			reginput++;
			break;
		case EXACTLY: {
			size_t len;
			TCHAR *const opnd = OPERAND(scan);

			// Inline the first character, for speed.
			if (*opnd != *reginput)
				return(0);
			len = _tcslen(opnd);
			if (len > 1 && _tcsncmp(opnd, reginput, len) != 0)
				return(0);
			reginput += len;
			break;
			}
		case ANYOF:
			if (*reginput == _T('\0') ||
					_tcschr(OPERAND(scan), *reginput) == NULL)
				return(0);
			reginput++;
			break;
		case ANYBUT:
			if (*reginput == _T('\0') ||
					_tcschr(OPERAND(scan), *reginput) != NULL)
				return(0);
			reginput++;
			break;
		case NOTHING:
			break;
		case BACK:
			break;
		case OPEN+1: case OPEN+2: case OPEN+3:
		case OPEN+4: case OPEN+5: case OPEN+6:
		case OPEN+7: case OPEN+8: case OPEN+9: {
			const int no = OP(scan) - OPEN;
			TCHAR *const input = reginput;

			if (regmatch(next)) {
				// Don't set startp if some later
				// invocation of the same parentheses
				// already has.

				if (startp[no] == NULL)
					startp[no] = input;
				return(1);
			} else
				return(0);
			break;
			}
		case CLOSE+1: case CLOSE+2: case CLOSE+3:
		case CLOSE+4: case CLOSE+5: case CLOSE+6:
		case CLOSE+7: case CLOSE+8: case CLOSE+9: {
			const int no = OP(scan) - CLOSE;
			TCHAR *const input = reginput;

			if (regmatch(next)) {
				// Don't set endp if some later
				// invocation of the same parentheses
				// already has.

				if (endp[no] == NULL)
					endp[no] = input;
				return(1);
			} else
				return(0);
			break;
			}
		case BRANCH: {
			TCHAR *const save = reginput;

			if (OP(next) != BRANCH)		// No choice.
				next = OPERAND(scan);	// Avoid recursion.
			else {
				while (OP(scan) == BRANCH) {
					if (regmatch(OPERAND(scan)))
						return(1);
					reginput = save;
					scan = regnext(scan);
				}
				return(0);
				// NOTREACHED
			}
			break;
			}
		case STAR:
		case PLUS: {
			const TCHAR nextch =
				(OP(next) == EXACTLY) ? *OPERAND(next) : _T('\0');
			size_t no;
			TCHAR *const save = reginput;
			const size_t min = (OP(scan) == STAR) ? 0 : 1;

			for (no = regrepeat(OPERAND(scan)) + 1; no > min; no--) {
				reginput = save + no - 1;
				// If it could work, try it.
				if (nextch == _T('\0') || *reginput == nextch)
					if (regmatch(next))
						return(1);
			}
			return(0);
			break;
			}
		case END:
			return(1);	// Success!
			break;
		default:
			TRACE0("regexp corruption\n");
			return(0);
			break;
		}
	}

	// We get here only if there's trouble -- normally "case END" is
	// the terminating point.

	TRACE0("corrupted pointers\n");
	return(0);
}


// regrepeat - report how many times something simple would match

size_t CRegExp::regrepeat(TCHAR *node)
{
	size_t count;
	TCHAR *scan;
	TCHAR ch;

	switch (OP(node))
	{
	case ANY:
		return(_tcslen(reginput));
		break;
	case EXACTLY:
		ch = *OPERAND(node);
		count = 0;
		for (scan = reginput; *scan == ch; scan++)
			count++;
		return(count);
		break;
	case ANYOF:
		return(_tcsspn(reginput, OPERAND(node)));
		break;
	case ANYBUT:
		return(_tcscspn(reginput, OPERAND(node)));
		break;
	default:		// Oh dear.  Called inappropriately.
		TRACE0("internal error: bad call of regrepeat\n");
		return(0);	// Best compromise.
		break;
	}
	// NOTREACHED
}

// regnext - dig the "next" pointer out of a node

TCHAR *CRegExp::regnext(TCHAR *p)
{
	const short &offset = *((short*)(p+1));

	if (offset == 0)
		return(NULL);

	return((OP(p) == BACK) ? p-offset : p+offset);
}

// GetReplaceString	- Converts a replace expression to a string
// Returns			- Pointer to newly allocated string
//					  Caller is responsible for deleting it
TCHAR* CRegExp::GetReplaceString( const TCHAR* sReplaceExp )
{
	TCHAR *src = (TCHAR *)sReplaceExp;
	TCHAR *buf;
	TCHAR c;
	int no;
	size_t len;

	if( sReplaceExp == NULL || sFoundText == NULL )
		return NULL;


	// First compute the length of the string
	int replacelen = 0;
	while ((c = *src++) != _T('\0'))
	{
		if (c == _T('&'))
			no = 0;
		else if (c == _T('\\') && isdigit(*src))
			no = *src++ - _T('0');
		else
			no = -1;

		if (no < 0)
		{
			// Ordinary character.
			if (c == _T('\\') && (*src == _T('\\') || *src == _T('&')))
				c = *src++;
			replacelen++;
		}
		else if (startp[no] != NULL && endp[no] != NULL &&
					endp[no] > startp[no])
		{
			// Get tagged expression
			len = endp[no] - startp[no];
			replacelen += len;
		}
	}

	// Now allocate buf
	buf = new TCHAR[replacelen+1];
	if( buf == NULL )
		return NULL;

	TCHAR* sReplaceStr = buf;

	// Add null termination
	buf[replacelen] = _T('\0');

	// Now we can create the string
	src = (TCHAR *)sReplaceExp;
	while ((c = *src++) != _T('\0'))
	{
		if (c == _T('&'))
			no = 0;
		else if (c == _T('\\') && isdigit(*src))
			no = *src++ - _T('0');
		else
			no = -1;

		if (no < 0)
		{
			// Ordinary character.
			if (c == _T('\\') && (*src == _T('\\') || *src == _T('&')))
				c = *src++;
			*buf++ = c;
		}
		else if (startp[no] != NULL && endp[no] != NULL &&
					endp[no] > startp[no])
		{
			// Get tagged expression
			len = endp[no] - startp[no];
			int tagpos = startp[no] - startp[0];

			_tcsncpy(buf, sFoundText + tagpos, len);
			buf += len;
		}
	}

	return sReplaceStr;
}