/*##############################################################################
Copyright (C) 2011 HPCC Systems.
All rights reserved. This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see .
############################################################################## */
#ifndef U_OVERRIDE_CXX_ALLOCATION
#define U_OVERRIDE_CXX_ALLOCATION 0 // Enabling this forces all allocation of ICU objects to ICU's heap, but is incompatible with jmemleak
#endif
#include
#include "jliball.hpp"
#include "eclrtl.hpp"
#include "hqlexpr.hpp"
#include "hqlcerrors.hpp"
#include "hqlutil.hpp"
#include "hqlregex.ipp"
#include "hqlhtcpp.ipp"
#include "hqlcatom.hpp"
#include "hqlcpputil.hpp"
#include "thorrparse.ipp"
#include "hqlthql.hpp"
#include "unicode/uchar.h"
//#define NEW_DFA_CALC
//#define DEFAULT_DFA_COMPLEXITY 0
#define DEFAULT_UNICODE_DFA_COMPLEXITY 0
#define DEFAULT_UTF8_DFA_COMPLEXITY 2000
#define DEFAULT_DFA_COMPLEXITY 10000
inline unsigned addWithoutOverflow(unsigned x, unsigned y)
{
if (UINT_MAX - x <= y)
return UINT_MAX;
return x + y;
}
inline unsigned multiplyWithoutOverflow(unsigned x, unsigned y)
{
if (UINT_MAX / x <= y)
return UINT_MAX;
return x * y;
}
//===========================================================================
static IValue * getCastConstant(IValue * value, type_t tc)
{
switch (tc)
{
case type_unicode:
return value->castTo(unknownUnicodeType);
case type_utf8:
return value->castTo(unknownUtf8Type);
case type_string:
return value->castTo(unknownStringType);
}
throwUnexpected();
}
static HqlRegexExpr * createRegexExpr(const ParseInformation & options, IHqlExpression * expr, bool caseSensitive)
{
switch (expr->getOperator())
{
case no_null:
case no_pat_beginrecursive:
case no_pat_beginpattern:
case no_pat_endpattern:
case no_pat_singlechar:
return new HqlSimpleRegexExpr(options, expr, caseSensitive);
}
return new HqlComplexRegexExpr(options, expr, caseSensitive);
}
static HqlRegexExpr * createRegexExpr(const ParseInformation & options, node_operator op, IHqlExpression * expr, bool caseSensitive)
{
switch (op)
{
case no_null:
case no_pat_beginrecursive:
case no_pat_beginpattern:
case no_pat_endpattern:
case no_pat_singlechar:
return new HqlSimpleRegexExpr(options, op, expr, caseSensitive);
}
return new HqlComplexRegexExpr(options, op, expr, caseSensitive);
}
//---------------------------------------------------------------------------
void HqlRegexHashTable::onAdd(void *et)
{
((HqlRegexExpr*)et)->Link();
}
void HqlRegexHashTable::onRemove(void *et)
{
((HqlRegexExpr*)et)->Release();
}
unsigned HqlRegexHashTable::getHashFromElement(const void *et) const
{
return ((HqlRegexExpr*)et)->getHash();
}
unsigned HqlRegexHashTable::getHashFromFindParam(const void *fp) const
{
return ((HqlRegexExpr*)fp)->getHash();
}
const void * HqlRegexHashTable::getFindParam(const void *et) const
{
return et;
}
bool HqlRegexHashTable::matchesFindParam(const void *et, const void *key, unsigned fphash) const
{
return et == key;
}
void HqlRegexUniqueArray::append(HqlRegexExpr & expr)
{
if (!hash.find(&expr))
{
array.append(expr);
hash.add(&expr);
}
else
{
expr.Release();
}
}
//---------------------------------------------------------------------------
inline unsigned limitedAdd(unsigned a, unsigned b)
{
if (PATTERN_UNLIMITED_LENGTH - a <= b)
return PATTERN_UNLIMITED_LENGTH;
return a+b;
}
inline unsigned limitedMult(unsigned a, unsigned b)
{
if (a == 0)
return 0;
if (PATTERN_UNLIMITED_LENGTH / a <= b)
return PATTERN_UNLIMITED_LENGTH;
return a*b;
}
IHqlExpression * RegexIdAllocator::createKey(IHqlExpression * expr, _ATOM name)
{
if (!expr)
return NULL;
IHqlExpression * body = expr->queryBody();
if (name)
return createSymbol(name, body->getType(), LINK(body));
return LINK(body);
}
void RegexIdAllocator::setID(IHqlExpression * expr, _ATOM name, unsigned value)
{
OwnedHqlExpr key = createKey(expr, name);
map.setValue(key, value);
}
regexid_t RegexIdAllocator::queryID(IHqlExpression * expr, _ATOM name)
{
OwnedHqlExpr key = createKey(expr, name);
regexid_t * match = map.getValue(key);
if (match)
return *match;
map.setValue(key, ++nextId);
return nextId;
}
int compareUnsigned(unsigned * left, unsigned * right)
{
return (*left < *right) ? -1 : (*left > *right) ? +1 : 0;
}
class SymbolArray
{
friend class SymbolArrayIterator;
public:
void addUniqueRange(unsigned low, unsigned high)
{
unsigned max = symbols.ordinality();
unsigned i;
for (i = 0; i < max; i++)
{
unsigned cur = symbols.item(i);
if (cur == low)
low++;
if (cur > low)
{
if (cur > high)
cur = high+1;
while (low != cur)
symbols.add(low++, i++);
low++;
}
if (cur > high)
return;
}
while (low <= high)
symbols.append(low++);
}
inline void addUnique(unsigned value)
{
bool isNew;
symbols.bAdd(value, compareUnsigned, isNew);
}
protected:
UnsignedArray symbols;
};
class SymbolArrayIterator
{
public:
SymbolArrayIterator(SymbolArray & _table) : table(_table) { cur = 0; }
inline bool first()
{
cur = 0; return table.symbols.isItem(cur);
}
inline bool isValid()
{
return table.symbols.isItem(cur);
}
inline unsigned get()
{
return table.symbols.item(cur);
}
inline bool next()
{
cur++; return table.symbols.isItem(cur);
}
protected:
SymbolArray & table;
unsigned cur;
};
//---------------------------------------------------------------------------
HqlNamedRegex::HqlNamedRegex(IHqlExpression * _expr, _ATOM _name, IHqlExpression * _searchExpr, node_operator _kind, bool _caseSensitive, bool _isMatched)
{
kind = _kind;
expr.set(_expr);
searchExpr.set(_searchExpr);
name = _name;
numUses = 1;
isMatched = _isMatched;
isRecursive = false;
doneCalcDfaScore = false;
doneCreateDFA = false;
doneExpandNamed = false;
doneExpandRecursion = false;
doneMarkDfas = false;
noGenerate = (kind == no_pat_dfa);
caseSensitive = _caseSensitive;
}
HqlNamedRegex::~HqlNamedRegex()
{
cleanup();
}
void HqlNamedRegex::addBeginEnd(const ParseInformation & options)
{
//Add nodes at either end of the pattern - at the start as a placeholder for all the possible first patterns,
//and at the end to mark which paths are terminal.
HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
follow->addOperand(createRegexExpr(options, isRecursive ? no_pat_beginrecursive : no_pat_beginpattern, NULL, caseSensitive));
follow->addOperand(LINK(def));
if (isRecursive)
follow->addOperand(createRegexExpr(options, no_pat_endrecursive, NULL, caseSensitive));
follow->addOperand(createRegexExpr(options, no_pat_endpattern, NULL, caseSensitive));
def.setown(follow);
}
void HqlNamedRegex::analyseNullLeadTrail()
{
def->analyseNullLeadTrail();
limit = def->limit;
}
void HqlNamedRegex::calcDfaScore()
{
if (!doneCalcDfaScore)
{
doneCalcDfaScore = true;
def->calcDfaScore();
}
}
void HqlNamedRegex::calculateNext()
{
def->calculateNext();
}
bool HqlNamedRegex::canBeNull()
{
return limit.canBeNull();
}
void HqlNamedRegex::cleanup()
{
if (def)
def->cleanup();
if (created)
created->dispose();
}
void HqlNamedRegex::createDFAs(RegexContext & ctx)
{
if (!doneCreateDFA)
{
doneCreateDFA = true;
def.setown(def->createDFAs(ctx));
}
}
void HqlNamedRegex::expandNamedSymbols()
{
if (!doneExpandNamed)
{
doneExpandNamed = true;
def.setown(def->expandNamedSymbols());
}
}
void HqlNamedRegex::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
{
if (!doneExpandRecursion)
{
doneExpandRecursion = true;
if (kind == no_pat_instance)
self = this;
def.setown(def->expandRecursion(ctx, self));
}
}
void HqlNamedRegex::generateDFAs()
{
if (!noGenerate)
def->generateDFAs();
}
void HqlNamedRegex::generateRegex(GenerateRegexCtx & ctx)
{
if (created || noGenerate)
return;
node_operator savedKind = ctx.namedKind;
ctx.namedKind = kind;
created.setown(new RegexNamed(name, ctx.idAllocator.queryID(expr, name)));
def->generateRegex(ctx);
def->connectRegex();
HqlRegexExpr * first = def->queryChild(0);
if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
first = &first->following.item(0);
created->setFirst(first->created);
ctx.namedKind = savedKind;
}
unsigned HqlNamedRegex::getDfaScore()
{
if (isRecursive)
return NO_DFA_SCORE;
return def->dfaScore;
}
void HqlNamedRegex::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
{
if (expr->queryType()->getTypeCode() == type_pattern)
return;
def.setown(def->insertSeparators(separator, ctx));
}
void HqlNamedRegex::markDFAs(unsigned complexity)
{
if (!doneMarkDfas)
{
doneMarkDfas = true;
def->markDFAs(complexity);
}
}
bool HqlNamedRegex::matchesDefine(IHqlExpression * name, bool _caseSensitive)
{
return (name->queryBody() == defineName) && (caseSensitive == _caseSensitive);
}
void HqlNamedRegex::mergeCreateSets()
{
def.setown(def->mergeCreateSets());
}
bool HqlNamedRegex::queryExpandInline()
{
expandNamedSymbols();
if (!isMatched)
{
if (numUses == 1)
return true;
//expand if trivial - this should be improved once dfa support is implemented.
switch (def->getOperator())
{
case no_pat_instance:
case no_pat_set:
case no_pat_const:
case no_pat_first:
case no_pat_last:
case no_pat_anychar:
return true;
}
}
return false;
}
RegexPattern * HqlNamedRegex::queryRootPattern()
{
//return a reference to the begin
HqlRegexExpr * first = def->queryChild(0);
if ((first->getOperator() == no_pat_beginpattern) && (first->following.ordinality() == 1))
first = &first->following.item(0);
return first->created;
}
void HqlNamedRegex::setRegexOwn(HqlRegexExpr * _def)
{
def.setown(_def);
IHqlExpression * defExpr = def->expr;
if (def->getOperator() == no_define)
defineName.set(defExpr->queryChild(1)->queryBody());
}
//---------------------------------------------------------------------------
static HqlRegexExpr * expandStringAsChars(IHqlExpression * expr, const ParseInformation & options, bool caseSensitive)
{
Owned castValue = getCastConstant(expr->queryChild(0)->queryValue(), options.type);
//convert text strings into a sequence of characters...
ITypeInfo * type = castValue->queryType();
unsigned len = type->getStringLen();
if (len == 0)
return createRegexExpr(options, no_null, NULL, caseSensitive);
HqlRegexExpr * expanded = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
bool readUChar = false;
const void * value = castValue->queryValue();
switch (options.type)
{
case type_unicode:
readUChar = true;
break;
case type_utf8:
//UTF8 is matched as a sequence of characters...
len = rtlUtf8Size(len, value);
break;
}
for (unsigned i = 0; i < len; i++)
{
unsigned nextValue;
if (readUChar)
nextValue = ((UChar *)value)[i];
else
nextValue = ((unsigned char *)value)[i];
OwnedHqlExpr next = getSizetConstant(nextValue);
expanded->addOperand(createRegexExpr(options, no_pat_singlechar, next, caseSensitive));
}
return expanded;
}
//---------------------------------------------------------------------------
HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
{
expr.set(_expr);
op = expr->getOperator();
caseSensitive = _caseSensitive;
init();
}
HqlRegexExpr::HqlRegexExpr(const ParseInformation & _options, node_operator _op, IHqlExpression * _expr, bool _caseSensitive) : options(_options)
{
expr.set(_expr);
op = _op;
caseSensitive = _caseSensitive;
init();
}
HqlRegexExpr::~HqlRegexExpr()
{
::Release(dfa);
}
void HqlRegexExpr::init()
{
uid = (unsigned)(getUniqueId()-options.uidBase);
cleaned = false;
connected = false;
analysed = false;
createDFA = false;
dfaScore = NO_DFA_SCORE;
dfa = NULL;
added = false;
}
void inherit(HqlRegexUniqueArray & target, const HqlRegexUniqueArray & source)
{
ForEachItemIn(idx, source)
target.append(OLINK(source.item(idx)));
}
void HqlRegexExpr::calcDfaScore()
{
ForEachItemIn(idx, args)
args.item(idx).calcDfaScore();
if (querySubPattern())
querySubPattern()->calcDfaScore();
if (queryNamed())
queryNamed()->calcDfaScore();
//on entry dfaScore = NO_DFA_SCORE
switch (getOperator())
{
case no_null:
dfaScore = 0;
break;
case no_pat_const:
dfaScore = expr->queryChild(0)->queryType()->getStringLen();
break;
case no_pat_anychar:
dfaScore = 1;
break;
case no_pat_set:
{
unsigned score = 0;
ForEachChild(idx, expr)
{
IHqlExpression * child = expr->queryChild(idx);
switch (child->getOperator())
{
case no_range:
{
unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
score = addWithoutOverflow(score, (high-low)+1);
break;
}
case no_constant:
score = addWithoutOverflow(score, 1);
break;
}
}
dfaScore = score;
break;
}
case no_pat_repeat:
if (isStandardRepeat())
{
unsigned argScore = args.item(0).dfaScore;
if ((getRepeatMax() > 1) && (argScore != NO_DFA_SCORE))
dfaScore = addWithoutOverflow(argScore, argScore);
else
dfaScore = argScore;
}
else if (!expr->hasProperty(minimalAtom))
{
unsigned max = getRepeatMax();
unsigned namedDfaScore = queryNamed()->getDfaScore();
if (max <= options.dfaRepeatMax)
dfaScore = multiplyWithoutOverflow(max, namedDfaScore);
}
break;
case no_pat_instance:
if (!queryNamed()->isMatched)
dfaScore = queryNamed()->getDfaScore();
break;
case no_pat_or:
{
unsigned totalScore = 0;
ForEachItemIn(idx, args)
{
unsigned score = args.item(idx).dfaScore;
if (score == NO_DFA_SCORE)
{
totalScore = NO_DFA_SCORE;
break;
}
totalScore = addWithoutOverflow(totalScore, score);
}
dfaScore = totalScore;
if (dfaScore == NO_DFA_SCORE)
{
//Work out if there are two or more entires which can be converted into a dfa, and if so combine them
//Also potentially separate simple sting lists into a separate or, so we always generate them
unsigned scoreCount = 0;
unsigned stringListCount = 0;
ForEachItemIn(iCount, args)
{
HqlRegexExpr & cur = args.item(iCount);
if (cur.dfaScore != NO_DFA_SCORE)
{
scoreCount++;
if (cur.isSimpleStringList())
stringListCount++;
}
}
if (scoreCount > 1)
{
HqlRegexExpr * newor = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
HqlRegexExpr * stringListOr = ((stringListCount > 1) && (scoreCount != stringListCount)) ? createRegexExpr(options, no_pat_or, NULL, caseSensitive) : NULL;
unsigned totalScore = 0;
unsigned stringListScore = 0;
for (unsigned i =0; i < args.ordinality(); )
{
HqlRegexExpr & cur = args.item(i);
unsigned score = cur.dfaScore;
if (score != NO_DFA_SCORE)
{
if (stringListOr && cur.isSimpleStringList())
{
stringListScore = addWithoutOverflow(stringListScore, score);
stringListOr->addOperand(&OLINK(cur));
}
else
newor->addOperand(&OLINK(cur));
totalScore = addWithoutOverflow(totalScore, score);
args.remove(i);
}
else
i++;
}
if (stringListOr)
{
stringListOr->dfaScore = stringListScore;
newor->addOperand(stringListOr);
}
newor->dfaScore = totalScore;
//add the dfa first - since it will probably be quickest
args.add(*newor, 0);
}
}
break;
}
case no_pat_follow:
{
//MORE: Should extract applicable subranges out of the sequences
//MORE: This should multiple rather than add.
dfaScore = 0;
ForEachItemIn(idx, args)
{
unsigned score = args.item(idx).dfaScore;
if (score == NO_DFA_SCORE)
{
dfaScore = NO_DFA_SCORE;
break;
}
dfaScore = addWithoutOverflow(dfaScore, score);
}
if (dfaScore == NO_DFA_SCORE)
{
//Try and find ranges that are valid, and extract those into sub-elements.
for (unsigned i =0; i < args.ordinality(); i++)
{
unsigned score = args.item(i).dfaScore;
if (score != NO_DFA_SCORE)
{
unsigned max = args.ordinality();
unsigned j;
for (j=i+1; j < max; j++)
{
unsigned thisScore = args.item(j).dfaScore;
if (thisScore == NO_DFA_SCORE)
break;
score = addWithoutOverflow(score, thisScore);
}
if (j != i+1)
{
HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
for (unsigned k=i; k < j; k++)
follow->addOperand(&OLINK(args.item(k)));
follow->dfaScore = score;
while (j != i)
args.remove(--j);
args.add(*follow, i);
}
}
}
}
break;
}
}
}
void HqlRegexExpr::calculateNext()
{
//Dragon p138
//if a concatenation (a,b), then all the trailing from (a) have leading(b) in their following set
//if a repeat [max > 1] then all positions in leading(x) are also in the following list
//Limited/minimal repeats aren't done the same way - so they aren't added in the same way
//Because I am cheating the order is important - repeats need to be done first since they are greedy.
switch (getOperator())
{
case no_pat_repeat:
if (isStandardRepeat() && getRepeatMax() > 1)
{
//All the trailing elements have the leading elements as possible next.
unsigned max = numTrailing();
for (unsigned idx=0; idx < max; idx++)
gatherLeading(queryTrailing(idx).following);
}
break;
}
ForEachItemIn(idx, args)
args.item(idx).calculateNext();
switch (getOperator())
{
case no_pat_follow:
case no_pat_separator:
{
//Internally connect up following items within the no_pat_follow.
unsigned max = args.ordinality();
assertex(max);
for (unsigned pairs=0; pairsnumTrailing();
for (unsigned trailIdx=0; trailIdxqueryTrailing(trailIdx);
right->gatherLeading(cur.following);
}
if (!right->canBeNull())
break;
}
}
break;
}
}
}
#if 0
//convert strings/unicode to correct kind depending on what is being searched, when the expr is built.
UBool U_CALL_CONV noteUCharRange(const void * context, UChar32 start, UChar limit, UCharCategory type)
{
}
void gatherRange(void * context, UCharCategory type)
{
u_enumCharTypes(noteUCharRange range, context);
}
#endif
static bool canConsume(const ParseInformation & options, unsigned nextChar, unsigned matcherChar, bool caseSensitive)
{
if (nextChar == matcherChar)
return true;
if (matcherChar & RegexSpecialMask)
{
assertex(options.type != type_utf8);
if (!(nextChar & RegexSpecialMask))
{
if (options.type != type_string)
return isUnicodeMatch(matcherChar, nextChar);
else
return isAsciiMatch(matcherChar, nextChar);
}
//Should only occur if unicode.
switch (matcherChar)
{
case RCCalnum:
return (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit);
case RCCalpha:
return (nextChar == RCClower) || (nextChar == RCCupper);
case RCCspace:
return (nextChar == RCCblank);
case RCCprint:
return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit) || (nextChar == RCCgraph);
case RCCgraph:
return (nextChar == RCCalnum) || (nextChar == RCCalpha) || (nextChar == RCClower) || (nextChar == RCCupper) || (nextChar == RCCdigit) || (nextChar == RCCpunct) || (nextChar == RCCxdigit);
case RCCxdigit:
return (nextChar == RCCdigit);
case RCCany:
return true;
}
return false;
}
else if (nextChar & RegexSpecialMask)
return false;
if (caseSensitive || (options.type == type_utf8))
return false;
if (options.type == type_unicode)
{
//MORE: This needs improving for full folding.
return u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) == u_foldCase(matcherChar, U_FOLD_CASE_DEFAULT);
}
return (nextChar == tolower(matcherChar)) || (nextChar == toupper(matcherChar));
}
bool HqlRegexExpr::canConsume(unsigned nextChar)
{
switch (getOperator())
{
case no_pat_singlechar:
return ::canConsume(options, nextChar, (unsigned)expr->queryValue()->getIntValue(), caseSensitive);
case no_pat_utf8single:
assertex(!(nextChar & RegexSpecialMask));
return nextChar < 0x80;
case no_pat_utf8lead:
assertex(!(nextChar & RegexSpecialMask));
return nextChar >= 0xc0;
case no_pat_utf8follow:
assertex(!(nextChar & RegexSpecialMask));
return nextChar >= 0x80 && nextChar <= 0xbf;
case no_pat_anychar:
assertex(options.type != type_utf8);
return true;
case no_pat_set:
{
assertex(options.type != type_utf8);
bool invert = expr->hasProperty(notAtom);
ForEachChild(idx, expr)
{
IHqlExpression * child = expr->queryChild(idx);
switch (child->getOperator())
{
case no_range:
{
unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
if (!(nextChar & RegexSpecialMask))
{
if (!caseSensitive)
{
if (options.type == type_unicode)
{
//MORE: Improved unicode
if (u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) >= u_foldCase(low, U_FOLD_CASE_DEFAULT) &&
u_foldCase(nextChar, U_FOLD_CASE_DEFAULT) <= u_foldCase(high, U_FOLD_CASE_DEFAULT))
return !invert;
}
else
{
if (nextChar >= (unsigned)tolower(low) && nextChar <= (unsigned)tolower(high))
return !invert;
if (nextChar >= (unsigned)toupper(low) && nextChar <= (unsigned)toupper(high))
return !invert;
}
}
else
{
if (nextChar >= low && nextChar <= high)
return !invert;
}
}
break;
}
case no_constant:
{
unsigned value = (unsigned)child->queryValue()->getIntValue();
if (::canConsume(options, nextChar, value, caseSensitive))
return !invert;
break;
}
case no_attr:
case no_attr_expr:
case no_attr_link:
break;
default:
UNIMPLEMENTED;
}
}
return invert;
}
case no_pat_endpattern:
return false;
default:
UNIMPLEMENTED;
}
}
void HqlRegexExpr::gatherConsumeSymbols(SymbolArray & symbols)
{
switch (getOperator())
{
case no_pat_singlechar:
{
unsigned charValue = (unsigned)expr->queryValue()->getIntValue();
if (!caseSensitive && (options.type != type_utf8))
{
if (options.type == type_unicode)
{
//MORE: Unicode - can you have several values, what about case conversion?
//MORE: String might need expanding to ('a'|'A')('ss'|'SS'|'B') or something similar.
symbols.addUnique(u_tolower(charValue));
symbols.addUnique(u_toupper(charValue));
}
else
{
symbols.addUnique(tolower(charValue));
symbols.addUnique(toupper(charValue));
}
}
else
symbols.addUnique(charValue);
break;
}
case no_pat_anychar:
assertex(options.type != type_utf8);
if (options.type == type_unicode)
symbols.addUnique(RCCany);
else
symbols.addUniqueRange(0, 255);
return;
case no_pat_endpattern:
return;
case no_pat_set:
assertex(options.type == type_string);
if (options.type == type_unicode)
{
//MORE: Need some kind of implementation for unicode - although invert and ranges may not be possible.
if (expr->hasProperty(notAtom))
throwError(HQLERR_DfaTooComplex);
ForEachChild(idx, expr)
{
IHqlExpression * child = expr->queryChild(idx);
switch (child->getOperator())
{
case no_range:
{
unsigned low = (unsigned)child->queryChild(0)->queryValue()->getIntValue();
unsigned high = (unsigned)child->queryChild(1)->queryValue()->getIntValue();
if (!caseSensitive)
{
//MORE: This really doesn't work very well - I'm not even sure what it means...
symbols.addUniqueRange(u_tolower(low), u_tolower(high));
symbols.addUniqueRange(u_toupper(low), u_toupper(high));
}
else
symbols.addUniqueRange(low, high);
break;
}
case no_constant:
{
unsigned value = (unsigned)child->queryValue()->getIntValue();
if (caseSensitive || (value & RegexSpecialMask))
symbols.addUnique(value);
else
{
symbols.addUnique(u_tolower(value));
symbols.addUnique(u_toupper(value));
}
break;
}
case no_attr:
case no_attr_expr:
case no_attr_link:
break;
default:
UNIMPLEMENTED;
}
}
}
else
{
//This is probably the easiest way to cope with inverted sets and all other complications.
for (unsigned i = 0; i < 256; i++)
if (canConsume(i))
symbols.addUnique(i);
}
return;
case no_pat_utf8single:
{
for (unsigned i = 0; i < 0x80; i++)
symbols.addUnique(i);
break;
}
case no_pat_utf8lead:
{
for (unsigned i = 0xc0; i < 0x100; i++)
symbols.addUnique(i);
break;
}
case no_pat_utf8follow:
{
for (unsigned i = 0x80; i < 0xc0; i++)
symbols.addUnique(i);
break;
}
default:
throwError1(HQLERR_BadDfaOperator, getOpString(getOperator()));
}
}
void HqlRegexExpr::cleanup()
{
if (cleaned)
return;
cleaned = true;
//kill anything that could have a loop.
ForEachItemIn(i0, args)
args.item(i0).cleanup();
if (querySubPattern())
querySubPattern()->cleanup();
if (created)
created->dispose();
following.kill();
args.kill();
}
HqlRegexExpr * HqlRegexExpr::clone()
{
assertex(!created && !analysed);
HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
ForEachItemIn(idx, args)
cloned->args.append(*args.item(idx).clone());
cloned->setNamed(queryNamed());
cloned->setSubPattern(querySubPattern());
return cloned;
}
void HqlRegexExpr::connectRegex()
{
if (connected)
return;
connected = true;
ForEachItemIn(idx, args)
args.item(idx).connectRegex();
if (created)
{
ForEachItemIn(idx2, following)
created->addLink(following.item(idx2).created);
HqlNamedRegex * named = queryNamed();
if (named && named->created && !named->noGenerate)
created->setBody(named->created);
if (querySubPattern())
created->setSubPattern(querySubPattern()->queryRootPattern());
}
}
HqlRegexExpr * HqlRegexExpr::createDFAs(RegexContext & ctx)
{
if (createDFA)
{
OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
HqlNamedRegex * newNamed = new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, caseSensitive, false);
ctx.named.append(*newNamed);
newNamed->setRegexOwn(expandAsDFA());
HqlRegexExpr * dfa = createRegexExpr(options, no_pat_dfa, NULL, caseSensitive);
dfa->setNamed(newNamed);
dfa->dfaScore = dfaScore;
//MORE: Need to save the original as a *named/subPattern* attribute in case conversion to a DFA fails
return dfa;
}
unsigned max = args.ordinality();
unsigned idx;
for (idx = 0; idx < max; idx++)
{
HqlRegexExpr & cur = args.item(idx);
HqlRegexExpr * transformed = cur.createDFAs(ctx);
args.replace(*transformed, idx);
if (transformed->getOperator() == no_pat_dfa)
{
if ((idx != 0) && (idx+1 != max) && (getOperator() == no_pat_follow))
{
if ((args.item(idx-1).getOperator() == no_pat_begintoken) &&
(args.item(idx+1).getOperator() == no_pat_endtoken))
{
transformed->addOperand(&OLINK(args.item(idx-1)));
args.remove(idx+1);
args.remove(idx-1);
idx--;
max -= 2;
}
}
}
}
if (querySubPattern())
querySubPattern()->createDFAs(ctx);
if (queryNamed())
queryNamed()->createDFAs(ctx);
return LINK(this);
}
HqlRegexExpr * HqlRegexExpr::expandAsRepeatedDFA(unsigned count)
{
if (count == 0)
return createRegexExpr(options, no_null, NULL, caseSensitive);
if (count == 1)
return expandAsDFA();
HqlRegexExpr * cloned = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
for (unsigned i=0; i < count; i++)
cloned->args.append(*expandAsDFA());
return cloned;
}
HqlRegexExpr * HqlRegexExpr::expandAsDFA()
{
assertex(!created && !analysed);
switch (getOperator())
{
case no_pat_const:
return expandStringAsChars(expr, options, caseSensitive);
case no_pat_instance:
return queryNamed()->def->expandAsDFA();
case no_pat_or:
case no_pat_follow:
case no_null:
case no_pat_singlechar:
break;
case no_pat_anychar:
if (options.type == type_utf8)
{
// convert to low | follow(lead, tail+);
Owned orRegexExpr = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
orRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8single, NULL, caseSensitive));
Owned followRegexExpr = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
followRegexExpr->addOperand(createRegexExpr(options, no_pat_utf8lead, NULL, caseSensitive));
OwnedHqlExpr trailExpr = createValue(no_pat_utf8follow, makePatternType());
Owned trailRegexExpr = createRegexExpr(options, no_pat_utf8follow, NULL, caseSensitive);
OwnedHqlExpr repeatExpr = createValue(no_pat_repeat, makePatternType(), LINK(trailExpr), createConstantOne(), createValue(no_any, makeNullType()));
Owned repeatRegexExpr = createRegexExpr(options, no_pat_repeat, repeatExpr, caseSensitive);
repeatRegexExpr->addOperand(trailRegexExpr.getClear());
followRegexExpr->addOperand(repeatRegexExpr.getClear());
orRegexExpr->addOperand(followRegexExpr.getClear());
return orRegexExpr.getClear();
}
break;
case no_pat_set:
if (options.type == type_utf8)
{
throwUnexpected();
//expand set values...
// return low | follow(lead, tail+);
}
break;
case no_pat_dfa:
return queryNamed()->def->expandAsDFA();
case no_pat_repeat:
{
if (isStandardRepeat())
break;
unsigned min = getRepeatMin();
unsigned max = getRepeatMax();
if (min == max)
return queryNamed()->def->expandAsRepeatedDFA(min);
HqlRegexExpr * cloned = createRegexExpr(options, no_pat_or, NULL, caseSensitive);
for (unsigned i=min; i <= max; i++)
cloned->args.append(*queryNamed()->def->expandAsRepeatedDFA(i));
return cloned;
}
default:
UNIMPLEMENTED;
}
HqlRegexExpr * cloned = createRegexExpr(options, op, expr, caseSensitive);
ForEachItemIn(idx, args)
cloned->args.append(*args.item(idx).expandAsDFA());
assertex(!queryNamed());
assertex(!querySubPattern());
return cloned;
}
HqlRegexExpr * HqlRegexExpr::expandNamedSymbols()
{
ForEachItemIn(idx, args)
args.replace(*args.item(idx).expandNamedSymbols(), idx);
if (querySubPattern())
querySubPattern()->expandNamedSymbols();
if (queryNamed())
queryNamed()->expandNamedSymbols();
switch (getOperator())
{
case no_pat_instance:
if (queryNamed()->queryExpandInline())
{
queryNamed()->removeUse();
return queryNamed()->def->clone();
}
break;
case no_pat_follow:
{
ForEachItemIn(idx, args)
{
node_operator op = args.item(idx).getOperator();
switch (op)
{
//Remove begin_token/end_token around the following, otherwise
//we might get too many separators which may be non-optional
case no_pat_first:
case no_pat_last:
case no_penalty:
case no_pat_x_before_y:
case no_pat_before_y:
case no_pat_x_after_y:
case no_pat_after_y:
case no_pat_guard:
if ((idx > 0) && args.item(idx-1).getOperator() == no_pat_begintoken)
{
assertex(args.item(idx+1).getOperator() == no_pat_endtoken);
args.remove(idx+1);
args.remove(idx-1);
if (args.isItem(idx) && (args.item(idx).getOperator() == no_pat_separator))
args.remove(idx);
}
break;
}
}
break;
}
}
return LINK(this);
}
HqlRegexExpr * HqlRegexExpr::expandRecursion(RegexContext & ctx, HqlNamedRegex * self)
{
ForEachItemIn(idx, args)
args.replace(*args.item(idx).expandRecursion(ctx, self), idx);
if (querySubPattern())
querySubPattern()->expandRecursion(ctx, self);
if (queryNamed())
queryNamed()->expandRecursion(ctx, self);
HqlNamedRegex * replacement = NULL;
switch (getOperator())
{
case no_self:
replacement = self;
break;
case no_pat_use:
assertex(expr->queryChild(0)->queryValue());
replacement = ctx.queryDefine(expr->queryChild(0), caseSensitive);
break;
case no_define:
return LINK(&args.item(0));
}
if (replacement)
{
replacement->expandRecursion(ctx, self);
replacement->addRecursiveUse();
HqlRegexExpr * instance = createRegexExpr(options, no_pat_instance, NULL, caseSensitive);
instance->setNamed(replacement);
return instance;
}
return LINK(this);
}
void HqlRegexExpr::generateDFA(IDfaPattern * pattern)
{
//Estimate the number of transitions to avoid lots of reallocs.
unsigned numTransitions = 0;
{
ForEachItemIn(idx, dfa->states)
{
HqlDfaState & cur = dfa->states.item(idx);
unsigned min = 255;
unsigned max = 0;
ForEachItemIn(idx2, cur.transitions)
{
HqlDfaTransition & curTransition = cur.transitions.item(idx2);
if (curTransition.code < min) min = curTransition.code;
if (curTransition.code > max) max = curTransition.code;
}
if (min <= max)
numTransitions += (max-min)+1;
}
}
pattern->init(dfa->states.ordinality(), numTransitions);
ForEachItemIn(idx, dfa->states)
{
HqlDfaState & cur = dfa->states.item(idx);
assertex(cur.id == idx);
pattern->beginState(cur.id);
ForEachItemIn(idx, cur.position)
{
HqlRegexExpr & curExpr = cur.position.item(idx);
if (curExpr.getOperator() == no_pat_endpattern)
pattern->setStateAccept(curExpr.getAcceptId());
}
ForEachItemIn(idx2, cur.transitions)
{
HqlDfaTransition & curTransition = cur.transitions.item(idx2);
pattern->addTransition(curTransition.code, curTransition.next->id);
}
pattern->endState();
}
pattern->finished();
}
void HqlRegexExpr::generateRegex(GenerateRegexCtx & ctx)
{
ForEachItemIn(idx, args)
args.item(idx).generateRegex(ctx);
assertex(!created);
if (queryNamed())
queryNamed()->generateRegex(ctx);
if (querySubPattern())
querySubPattern()->generateRegex(ctx);
switch (getOperator())
{
case no_null:
case no_pat_beginpattern:
created.setown(new RegexNullPattern);
break;
case no_pat_beginrecursive:
created.setown(new RegexRecursivePattern);
break;
case no_pat_beginseparator:
created.setown(new RegexBeginSeparatorPattern);
break;
case no_pat_endseparator:
created.setown(new RegexEndSeparatorPattern);
break;
case no_pat_endrecursive:
created.setown(new RegexEndRecursivePattern);
break;
case no_pat_instance:
created.setown(new RegexNamedPattern);
break;
case no_pat_begincheck:
case no_pat_beginvalidate:
created.setown(new RegexBeginCheckPattern);
break;
case no_pat_endcheckin:
{
bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
created.setown(new RegexCheckInPattern(expr->hasProperty(notAtom), stripSeparator));
break;
}
case no_pat_endchecklength:
{
unsigned minLength = 0;
unsigned maxLength = PATTERN_UNLIMITED_LENGTH;
getCheckRange(expr->queryChild(1), minLength, maxLength, ctx.info.charSize);
bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
created.setown(new RegexCheckLengthPattern(expr->hasProperty(notAtom), stripSeparator, minLength, maxLength));
break;
}
case no_pat_endvalidate:
{
unsigned idx = ctx.regex.getValidatorIndex(expr);
assertex(idx != NotFound);
ValidateKind validatorKind = getValidateKind(ctx.regex.queryValidateExpr(expr));
bool stripSeparator = ctx.info.addedSeparators && (expr->queryType()->getTypeCode() != type_pattern);
switch (ctx.info.inputFormat())
{
case NlpUnicode:
if (validatorKind != ValidateIsString)
created.setown(new RegexValidateUniAsUniPattern(stripSeparator, idx));
else
created.setown(new RegexValidateUniAsAscPattern(stripSeparator, idx));
break;
case NlpUtf8:
if (validatorKind != ValidateIsString)
created.setown(new RegexValidateUtf8AsUniPattern(stripSeparator, idx));
else
created.setown(new RegexValidateUtf8AsAscPattern(stripSeparator, idx));
break;
case NlpAscii:
if (validatorKind != ValidateIsUnicode)
created.setown(new RegexValidateAscAsAscPattern(stripSeparator, idx));
else
created.setown(new RegexValidateAscAsUniPattern(stripSeparator, idx));
break;
}
break;
}
case no_pat_begintoken:
created.setown(new RegexBeginTokenPattern);
break;
case no_pat_endtoken:
created.setown(new RegexEndTokenPattern);
break;
case no_pat_x_before_y:
case no_pat_before_y:
created.setown(new RegexAssertNextPattern(expr->hasProperty(notAtom)));
break;
case no_pat_x_after_y:
case no_pat_after_y:
{
unsigned minSize = limitedMult(querySubPattern()->limit.minLength, ctx.info.charSize);
unsigned maxSize = limitedMult(querySubPattern()->limit.maxLength, ctx.info.charSize);
created.setown(new RegexAssertPrevPattern(expr->hasProperty(notAtom), minSize, maxSize));
break;
}
case no_pat_first:
created.setown(new RegexStartPattern);
break;
case no_pat_last:
created.setown(new RegexFinishPattern);
break;
case no_pat_anychar:
created.setown(new RegexAnyCharPattern);
break;
case no_penalty:
{
int penalty = 1;
if (expr->numChildren() > 0)
penalty = (int)expr->queryChild(0)->queryValue()->getIntValue();
created.setown(new RegexPenaltyPattern(penalty));
}
break;
case no_pat_set:
{
RegexSetBasePattern * pattern;
if (ctx.info.type != type_string)
pattern = new RegexUnicodeSetPattern(caseSensitive);
else if (caseSensitive)
pattern = new RegexAsciiSetPattern;
else
pattern = new RegexAsciiISetPattern;
created.setown(pattern);
if (expr->hasProperty(notAtom))
pattern->setInvert(true);
ForEachChild(idx, expr)
{
IHqlExpression * child = expr->queryChild(idx);
switch (child->getOperator())
{
case no_range:
pattern->addRange((unsigned)child->queryChild(0)->queryValue()->getIntValue(), (unsigned)child->queryChild(1)->queryValue()->getIntValue());
break;
case no_constant:
pattern->addRange((unsigned)child->queryValue()->getIntValue(), (unsigned)child->queryValue()->getIntValue());
break;
case no_attr:
case no_attr_expr:
case no_attr_link:
break;
default:
UNIMPLEMENTED;
}
}
break;
}
case no_pat_const:
{
IValue * value = expr->queryChild(0)->queryValue();
Owned castValue = getCastConstant(value, ctx.info.type);
unsigned len = castValue->queryType()->getStringLen();
switch (ctx.info.type)
{
case type_unicode:
{
const UChar * data = (const UChar *)castValue->queryValue();
if (caseSensitive)
created.setown(new RegexUnicodePattern(len, data));
else
created.setown(new RegexUnicodeIPattern(len, data));
break;
}
case type_utf8:
{
const char * data = (const char *)castValue->queryValue();
if (caseSensitive)
created.setown(new RegexUtf8Pattern(len, data));
else
created.setown(new RegexUtf8IPattern(len, data));
break;
}
case type_string:
{
const char * data = (const char *)castValue->queryValue();
if (caseSensitive)
created.setown(new RegexAsciiPattern(len, data));
else
created.setown(new RegexAsciiIPattern(len, data));
break;
}
}
break;
}
case no_pat_or:
case no_pat_follow:
case no_pat_separator:
//Nothing is generated for these....
break;
case no_pat_dfa:
{
RegexDfaPattern * pattern;
bool isToken = false;
HqlRegexExpr * child = queryChild(0);
if (child && child->getOperator() == no_pat_begintoken)
isToken = true;
if (options.type == type_unicode)
{
throwUnexpected();
pattern = new RegexUnicodeDfaPattern;
}
else
pattern = new RegexAsciiDfaPattern(isToken);
created.setown(pattern);
Owned builder = pattern->createBuilder();
generateDFA(builder);
}
break;
case no_pat_repeat:
if (!isStandardRepeat())
{
if (expr->queryChild(0)->getOperator() == no_pat_anychar)
{
created.setown(new RegexRepeatAnyPattern(getRepeatMin(), getRepeatMax(), expr->hasProperty(minimalAtom)));
queryNamed()->noGenerate = true;
}
else
created.setown(new RegexRepeatPattern(getRepeatMin(), getRepeatMax(), !expr->hasProperty(minimalAtom)));
}
break;
case no_pat_endpattern:
switch (ctx.namedKind)
{
case no_pat_instance:
case no_pat_repeat:
created.setown(new RegexEndNestedPattern);
break;
case no_pat_checkin:
created.setown(new RegexDonePattern);
break;
case no_parse:
created.setown(new RegexDonePattern);
break;
default:
UNIMPLEMENTED;
}
break;
default:
UNIMPLEMENTED;
}
}
unsigned HqlRegexExpr::getAcceptId()
{
//Overloaded, and a bit nasty, but will do for the moment....
if (dfaScore == NO_DFA_SCORE)
return 99;
return dfaScore;
}
unsigned HqlRegexExpr::getRepeatMin()
{
return ::getRepeatMin(expr);
}
unsigned HqlRegexExpr::getRepeatMax()
{
return ::getRepeatMax(expr);
}
bool HqlRegexExpr::isStandardRepeat()
{
if (::isStandardRepeat(expr))
{
if (options.expandRepeatAnyAsDfa || expr->queryChild(0)->getOperator() != no_pat_anychar)
return true;
}
return false;
}
HqlRegexExpr * HqlRegexExpr::insertSeparators(IHqlExpression * separator, RegexContext * ctx)
{
switch (getOperator())
{
case no_pat_follow:
{
ForEachItemInRev(idx, args)
{
HqlRegexExpr & cur = args.item(idx);
if (cur.getOperator() == no_pat_endtoken)
{
if (args.item(idx-1).getOperator() != no_pat_last)
{
//Separators are slightly weird. Because they are implicit, the whole thing should
//be optional (unlike a name where just the contents are optional).
//Should probably remove the optionality from the separator's value, and
//put it on the surrounding separator instead.
HqlRegexExpr * sepRegex = createRegexExpr(options, no_pat_separator, NULL, caseSensitive);
sepRegex->addOperand(createRegexExpr(options, no_pat_beginseparator, NULL, caseSensitive));
sepRegex->addOperand(ctx->createStructure(separator, ctx->isCaseSensitive()));
sepRegex->addOperand(createRegexExpr(options, no_pat_endseparator, NULL, caseSensitive));
args.add(*sepRegex, idx+1);
}
}
}
break;
}
}
ForEachChild(idx, this)
{
replaceOperand(queryChild(idx)->insertSeparators(separator, ctx), idx);
}
return LINK(this);
}
bool HqlRegexExpr::isWorthConvertingToDFA()
{
switch (getOperator())
{
case no_pat_or: case no_pat_repeat:
case no_pat_follow:
return true;
}
ForEachItemIn(idx, args)
if (args.item(idx).isWorthConvertingToDFA())
return true;
return false;
}
bool HqlRegexExpr::isSimpleStringList()
{
switch (getOperator())
{
case no_pat_or:
case no_pat_case:
case no_pat_nocase:
break;
case no_pat_const:
return true;
default:
return false;
}
ForEachItemIn(idx, args)
if (!args.item(idx).isSimpleStringList())
return false;
return true;
}
void HqlRegexExpr::markDFAs(unsigned complexity)
{
if (dfaScore != NO_DFA_SCORE)
{
//Don't convert instances - convert the definition so it can be reused.
//and if not worth converting, then children won't be either.
if (getOperator() != no_pat_instance)
{
if (dfaScore <= complexity)
{
createDFA = isWorthConvertingToDFA();
return;
}
else if (isSimpleStringList())
{
createDFA = true;
return;
}
}
}
ForEachItemIn(idx, args)
args.item(idx).markDFAs(complexity);
if (queryNamed())
queryNamed()->markDFAs(complexity);
if (querySubPattern())
querySubPattern()->markDFAs(complexity);
}
HqlRegexExpr * HqlRegexExpr::mergeCreateSets()
{
ForEachItemIn(idx, args)
replaceOperand(args.item(idx).mergeCreateSets(), idx);
if (getOperator() == no_pat_or)
{
unsigned singleCount = 0;
unsigned setCount = 0;
unsigned nullCount = 0;
HqlExprArray setArgs;
HqlRegexExprArray newArgs;
ForEachItemIn(idx, args)
{
HqlRegexExpr * cur = &args.item(idx);
bool curIsOptional = false;
if ((cur->getOperator() == no_pat_repeat) && (cur->getRepeatMin() == 0) && (cur->getRepeatMax() == 1))
{
curIsOptional = true;
cur = cur->queryChild(0);
}
if (cur->caseSensitive == caseSensitive)
{
switch (cur->getOperator())
{
case no_null:
nullCount++;
break;
case no_pat_set:
{
setCount++;
ForEachChild(iset, cur->expr)
{
IHqlExpression * child = cur->expr->queryChild(iset);
if (setArgs.find(*child) == NotFound)
setArgs.append(*LINK(child));
}
if (curIsOptional) nullCount++;
break;
}
case no_pat_const:
{
ITypeInfo * type = cur->expr->queryChild(0)->queryType();
unsigned len = type->getStringLen();
if (len == 0)
nullCount++;
else if (len == 1)
{
const void * value = cur->expr->queryChild(0)->queryValue()->queryValue();
unsigned nextValue;
if (type->getTypeCode() == type_utf8)
nextValue = rtlUtf8Char(value);
else if (isUnicodeType(type))
nextValue = *(UChar *)value;
else
nextValue = *(unsigned char *)value;
OwnedHqlExpr nextExpr = createConstant((int)nextValue);
if (setArgs.find(*nextExpr) == NotFound)
setArgs.append(*nextExpr.getClear());
singleCount++;
if (curIsOptional) nullCount++;
}
else
newArgs.append(*LINK(cur));
break;
}
default:
newArgs.append(*LINK(cur));
break;
}
}
else
newArgs.append(*LINK(cur));
}
if ((setCount > 1) || (setCount == 1 && singleCount > 0) || (singleCount > 1))
{
OwnedHqlExpr newSetExpr = createValue(no_pat_set, makePatternType(), setArgs);
if (nullCount)
newSetExpr.setown(createValue(no_pat_repeat, makePatternType(), newSetExpr.getClear(), createConstant(0), createConstantOne()));
HqlRegexExpr * newSetRegex = createRegexExpr(options, newSetExpr, caseSensitive);
if (newArgs.ordinality() == 0)
return newSetRegex;
args.kill();
ForEachItemIn(idx, newArgs)
args.append(OLINK(newArgs.item(idx)));
args.append(*newSetRegex);
}
}
return LINK(this);
}
#if 0
HqlRegexExpr * HqlRegexExpr::removeTrivialInstances()
{
node_operator op = expr->getOperator();
switch (op)
{
case no_pat_instance:
{
IHqlExpression * def = expr->queryChild(0);
if (!queryNamed()->isMatched && def->getOperator() == no_pat_instance)
{
queryNamed()->removeUse();
return
#if 0
//Do this later - recursion means we can't do it yet
//Optimization whilst building the tree. If a named symbol is just a definition of another named symbol
//and this named symbol isn't matched explicitly, then expand it.
if (!isMatched(def) && def->getOperator() == no_pat_instance)
return createStructure(def);
else
}
}
}
}
#endif
#endif
//---------------------------------------------------------------------------
void HqlSimpleRegexExpr::analyseNullLeadTrail()
{
if (analysed)
return;
analysed = true;
switch (getOperator())
{
case no_null:
limit.minLength = 0;
limit.maxLength = 0;
break;
case no_pat_beginrecursive:
limit.minLength = 0;
limit.maxLength = 0;
limit.containsAssertion = true;
break;
case no_pat_beginpattern:
limit.minLength = 0;
limit.maxLength = 0;
break;
case no_pat_endpattern:
limit.minLength = 0;
limit.maxLength = 0;
limit.containsAssertion = true;
break;
case no_pat_singlechar:
limit.minLength = 1;
limit.maxLength = 1;
break;
}
}
void HqlSimpleRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
{
switch (op)
{
case no_null:
case no_pat_beginrecursive:
case no_pat_beginpattern:
break;
case no_pat_endpattern:
case no_pat_singlechar:
target.append(OLINK(*this));
break;
default:
UNIMPLEMENTED;
}
}
void HqlSimpleRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
{
unsigned max = numTrailing();
for (unsigned i = 0; i < max; i++)
target.append(OLINK(queryTrailing(i)));
}
unsigned HqlSimpleRegexExpr::numTrailing() const
{
switch (getOperator())
{
case no_null:
case no_pat_endpattern:
return 0;
case no_pat_beginrecursive:
case no_pat_beginpattern:
case no_pat_singlechar:
return 1;
default:
UNIMPLEMENTED;
}
}
HqlRegexExpr & HqlSimpleRegexExpr::queryTrailing(unsigned idx)
{
switch (getOperator())
{
case no_pat_beginrecursive:
case no_pat_beginpattern:
case no_pat_singlechar:
if (idx == 0)
return *this;
break;
}
UNIMPLEMENTED;
}
//---------------------------------------------------------------------------
void HqlComplexRegexExpr::analyseNullLeadTrail()
{
if (analysed)
return;
analysed = true;
ForEachItemIn(idx, args)
args.item(idx).analyseNullLeadTrail();
if (queryNamed())
named->analyseNullLeadTrail();
switch (getOperator())
{
case no_null:
case no_pat_beginrecursive:
case no_pat_beginpattern:
case no_pat_endpattern:
case no_pat_singlechar:
//Should be handled by simpleRegex
UNIMPLEMENTED;
break;
case no_pat_instance:
limit = named->queryLimit();
limit.containsAssertion = true; // require us to walk it, even if internally matches nothing.
leading.append(*LINK(this));
trailing.append(*LINK(this));
break;
case no_pat_dfa:
limit = named->queryLimit();
limit.containsAssertion = true; // this means that it will process optional values
leading.append(*LINK(this));
trailing.append(*LINK(this));
break;
case no_pat_endrecursive:
case no_pat_x_before_y:
case no_pat_before_y:
case no_pat_x_after_y:
case no_pat_after_y:
case no_pat_first:
case no_pat_last:
case no_pat_begintoken:
case no_pat_endtoken:
case no_pat_begincheck:
case no_pat_endcheckin:
case no_pat_endchecklength:
case no_pat_beginseparator:
case no_pat_endseparator:
case no_pat_beginvalidate:
case no_pat_endvalidate:
case no_penalty:
limit.containsAssertion = true;
limit.minLength = 0;
limit.maxLength = 0;
leading.append(*LINK(this));
trailing.append(*LINK(this));
break;
case no_pat_anychar:
case no_pat_set:
limit.minLength = 1;
limit.maxLength = 1;
leading.append(*LINK(this));
trailing.append(*LINK(this));
break;
case no_pat_const:
{
unsigned len = expr->queryChild(0)->queryType()->getStringLen();
limit.minLength = len;
limit.maxLength = len;
if (len != 0)
{
leading.append(*LINK(this));
trailing.append(*LINK(this));
}
}
break;
case no_pat_or:
{
limit.minLength = PATTERN_UNLIMITED_LENGTH;
limit.maxLength = 0;
limit.containsAssertion = true;
ForEachItemIn(idx, args)
{
HqlRegexExpr & cur = args.item(idx);
if (cur.limit.minLength < limit.minLength)
limit.minLength = cur.limit.minLength;
if (cur.limit.maxLength > limit.maxLength)
limit.maxLength = cur.limit.maxLength;
if (cur.limit.canBeNull())
limit.containsAssertion = false;
cur.gatherLeading(leading);
cur.gatherTrailing(trailing);
}
break;
}
case no_pat_separator:
{
assertex(args.ordinality() == 3);
limit = args.item(1).limit;
leading.append(OLINK(args.item(0)));
trailing.append(OLINK(args.item(2)));
break;
}
case no_pat_follow:
{
limit.minLength = 0;
limit.maxLength = 0;
ForEachItemIn(idx0, args)
{
HqlRegexExpr & cur = args.item(idx0);
limit.minLength = limitedAdd(limit.minLength, cur.limit.minLength);
limit.maxLength = limitedAdd(limit.maxLength, cur.limit.maxLength);
if (cur.limit.containsAssertion)
limit.containsAssertion = true;
}
ForEachItemIn(idx1, args)
{
HqlRegexExpr & cur = args.item(idx1);
cur.gatherLeading(leading);
if (!cur.canBeNull())
break;
}
ForEachItemInRev(idx2, args)
{
HqlRegexExpr & cur = args.item(idx2);
cur.gatherTrailing(trailing);
if (!cur.canBeNull())
break;
}
break;
}
case no_pat_repeat:
{
if (isStandardRepeat())
{
HqlRegexExpr & arg = args.item(0);
limit.minLength = limitedMult(getRepeatMin(), arg.limit.minLength);
limit.maxLength = limitedMult(getRepeatMax(), arg.limit.maxLength);
if (getRepeatMin() != 0) limit.containsAssertion = arg.limit.containsAssertion;
arg.gatherLeading(leading);
arg.gatherTrailing(trailing);
}
else
{
limit.minLength = limitedMult(getRepeatMin(), named->getMinLength());
limit.maxLength = limitedMult(getRepeatMax(), named->getMaxLength());
//if (getRepeatMin() != 0) limit.containsAssertion = named->queryLimit().containsAssertion;
limit.containsAssertion = true;
leading.append(*LINK(this));
trailing.append(*LINK(this));
}
break;
}
case no_pat_pattern:
// Should have been converted by the time we get here...
default:
UNIMPLEMENTED;
break;
}
}
void HqlComplexRegexExpr::cleanup()
{
if (cleaned)
return;
HqlRegexExpr::cleanup();
trailing.kill();
leading.kill();
subPattern.clear();
}
void HqlComplexRegexExpr::gatherLeading(HqlRegexUniqueArray & target)
{
inherit(target, leading);
}
void HqlComplexRegexExpr::gatherTrailing(HqlRegexUniqueArray & target)
{
inherit(target, trailing);
}
//---------------------------------------------------------------------------
// DFA helper classes.
inline int compareHqlRegexExpr(HqlRegexExpr * left, HqlRegexExpr * right)
{
unique_id_t idl = left->queryUid();
unique_id_t idr = right->queryUid();
return (idl < idr) ? -1 : (idl > idr) ? +1 : 0;
}
int compareHqlRegexExpr(CInterface * * left, CInterface * * right)
{
return compareHqlRegexExpr((HqlRegexExpr *)*left, (HqlRegexExpr *)*right);
}
void HqlRegexExprSet::add(HqlRegexExpr * value)
{
bool isNew;
CInterface * castValue = value;
values.bAdd(castValue, compareHqlRegexExpr, isNew);
}
int HqlRegexExprSet::compare(const HqlRegexExprSet & other) const
{
unsigned numThis = values.ordinality();
unsigned numOther = other.values.ordinality();
unsigned numCommon = std::min(numThis, numOther);
for (unsigned i = 0; i < numCommon; i++)
{
HqlRegexExpr & left = values.item(i);
HqlRegexExpr & right = other.values.item(i);
int c = compareHqlRegexExpr(&left, &right);
if (c)
return c;
}
if (numThis > numOther)
return +1;
else if (numThis < numOther)
return -1;
else
return 0;
}
bool HqlRegexExprSet::equals(const HqlRegexExprSet & other) const
{
unsigned numThis = values.ordinality();
unsigned numOther = other.values.ordinality();
if (numThis != numOther)
return false;
for (unsigned i = 0; i < numThis; i++)
{
if (&values.item(i) != &other.values.item(i))
return false;
}
return true;
}
bool HqlDfaState::isAccepting()
{
ForEachItemIn(idx, position)
if (position.item(idx).getOperator() == no_pat_endpattern)
return true;
return false;
}
static inline int compareState(HqlDfaState * left, HqlDfaState * right)
{
return left->position.compare(right->position);
}
static int compareState(CInterface * * left, CInterface * * right)
{
return compareState((HqlDfaState *)*left, (HqlDfaState *)*right);
}
void HqlRegexExpr::generateDFAs()
{
if (op != no_pat_dfa)
{
ForEachItemIn(idx, args)
args.item(idx).generateDFAs();
return;
}
//Adapted from Dragon p141 Fig 3.44
//Main variation is that the set of potential symbols is calculated first, rather than trying the whole alphabet
//it might also have character-classes e.g., [[:alpha:]] in the stream.
dfa = new HqlDfaInfo;
HqlDfaState * firstState = new HqlDfaState(0);
HqlRegexExpr * first = queryNamed()->def->queryChild(0);
ForEachItemIn(idx, first->following)
firstState->position.add(&first->following.item(idx));
//Store a list of states, and an ordered list. First marks which have been processed,
//second gives efficient duplicate detection.
dfa->states.append(*firstState);
dfa->orderedStates.append(*LINK(firstState));
unsigned curStateIndex = 0;
while (curStateIndex < dfa->states.ordinality())
{
HqlDfaState & curState = dfa->states.item(curStateIndex++);
//First gather the potential symbols that come next (otherwise we would die with unicode!)
//it also speeds up ascii by a fairly large factor on sets of strings.
SymbolArray nextSymbols;
ForEachItemIn(ip, curState.position)
curState.position.item(ip).gatherConsumeSymbols(nextSymbols);
//For each symbol, work out what combination of states we could become translated to.
SymbolArrayIterator iter(nextSymbols);
ForEach(iter)
{
unsigned curSymbol = iter.get();
HqlDfaState * nextState = new HqlDfaState(dfa->states.ordinality());
//For each NFA state we could be in, what new NFA state could be now be in...
ForEachItemIn(ip, curState.position)
{
HqlRegexExpr & curRegex = curState.position.item(ip);
if (curRegex.canConsume(curSymbol))
{
ForEachItemIn(idx2, curRegex.following)
nextState->position.add(&curRegex.following.item(idx2));
}
}
//If no states, then don't bother adding a transition - we're done..
if (nextState->position.ordinality())
{
bool isNew = false;
CInterface * castState = nextState;
unsigned matchIndex = dfa->orderedStates.bAdd(castState, compareState, isNew);
if (isNew)
dfa->states.append(*LINK(nextState));
else
nextState->Release();
HqlDfaState * matchedState = &dfa->orderedStates.item(matchIndex);
//Finally associate a transition for this symbol...
curState.transitions.append(*new HqlDfaTransition(curSymbol, matchedState));
}
else
nextState->Release();
}
}
}
/**
Some unicode pseudo code
ForEach(ip in curState.position)
{
position(ip).getConsumed(consumeSet);
positionSet = [ip];
if (consumeSet.ordinality())
Merge(consumeSet, 0, positionSet);
}
Merge(consumeSet, first, positionSet)
{
for (i = first; i < sets.ordinality(); i++)
{
diff = intersect(sets(i).consume, consumeSet);
if (diff.ordinality())
{
rightOnly = sets(i).consume - diff;
if (rightOnly.ordinality() == 0)
sets(i).positions.union(positionSet);
else
{
sets(i).consume = rightOnly;
Merge(diff, i+1, union(sets(i).positions, positionSet);
}
newOnly = consumeSet - diff;
if (newOnly = [])
return;
consumeSet = newOnly;
}
}
sets.append(consumeSet, positionSet);
}
Output would be a set of disjoint sets, together with a lits of positions that are matched by those sets.
They could then be used to generate the tables required by a dfa matcher list of
low, high, target-state
which was binary chopped.
It might be better to not use the icu sets, except for generating a set of ranges, and handle everything else ourselves.
Possibly a set of [low,high,positions], or maybe even [low, high, position], sorted by low
with some clever code to walk through and retain lists of which ones are active.
**/
//---------------------------------------------------------------------------
RegexContext::RegexContext(IHqlExpression * _expr, IWorkUnit * _wu, const HqlCppOptions & _options, ITimeReporter * _timeReporter, byte _algorithm) : NlpParseContext(_expr, _wu, _options, _timeReporter), parser(NULL, _algorithm)
{
info.addedSeparators = false;
switch (info.type)
{
case type_string: info.dfaComplexity = DEFAULT_DFA_COMPLEXITY; break;
case type_utf8: info.dfaComplexity = DEFAULT_UTF8_DFA_COMPLEXITY; break;
case type_unicode: info.dfaComplexity = DEFAULT_UNICODE_DFA_COMPLEXITY; break;
}
if (_options.parseDfaComplexity != (unsigned)-1)
info.dfaComplexity = _options.parseDfaComplexity;
info.expandRepeatAnyAsDfa = _options.expandRepeatAnyAsDfa;
createLexer = false;
}
RegexContext::~RegexContext()
{
ForEachItemIn(idx, named)
named.item(idx).cleanup();
}
HqlNamedRegex * RegexContext::queryNamed(IHqlExpression * defn, _ATOM name, node_operator op, bool caseSensitive)
{
ForEachItemIn(idx, named)
{
HqlNamedRegex & cur = named.item(idx);
if (cur.matches(defn, name, caseSensitive))
{
assertex(cur.matches(defn, name, op, caseSensitive));
return &cur;
}
}
return NULL;
}
HqlNamedRegex * RegexContext::createNamed(IHqlExpression * expr, _ATOM name, node_operator op, bool caseSensitive)
{
LinkedHqlExpr searchExpr = expr;
if (op != no_pat_instance)
//Create an expression that should never clash with anything we would find in the parse tree
searchExpr.setown(createValue(op, expr->getType(), LINK(expr), createAttribute(tempAtom)));
HqlNamedRegex * match = queryNamed(searchExpr, name, op, caseSensitive);
if (match)
match->addUse();
else
{
match = new HqlNamedRegex(expr, name, searchExpr, op, caseSensitive, isMatched(searchExpr, name));
named.append(*match); // add to list first to avoid recursion problems.
match->setRegexOwn(createStructure(expr, caseSensitive));
}
return match;
}
static HqlRegexExpr * createFollow(const ParseInformation & options, HqlRegexExpr * a1, HqlRegexExpr * a2, HqlRegexExpr * a3, bool caseSensitive)
{
HqlRegexExpr * follow = createRegexExpr(options, no_pat_follow, NULL, caseSensitive);
follow->addOperand(a1);
follow->addOperand(a2);
if (a3)
follow->addOperand(a3);
return follow;
}
HqlRegexExpr * RegexContext::createFollow(HqlRegexExpr * start, IHqlExpression * body, node_operator endOp, IHqlExpression * endExpr, bool caseSensitive)
{
HqlRegexExpr * next = createStructure(body, caseSensitive);
HqlRegexExpr * finish = createRegexExpr(info, endOp, endExpr, caseSensitive);
return ::createFollow(info, start, next, finish, caseSensitive);
}
HqlRegexExpr * RegexContext::createStructure(IHqlExpression * expr, bool caseSensitive)
{
node_operator op = expr->getOperator();
switch (op)
{
case no_pat_production:
throwError(HQLERR_RegexNoTransformSupport);
case no_pat_instance:
{
IHqlExpression * def = expr->queryChild(0);
if (createLexer)
return createStructure(def, caseSensitive);
Owned instance = createRegexExpr(info, expr, caseSensitive);
IHqlExpression * nameExpr = expr->queryChild(1);
instance->setNamed(createNamed(def, nameExpr ? nameExpr->queryName() : NULL, op, caseSensitive));
return instance.getClear();
}
case no_pat_guard:
return createRegexExpr(info, no_null, NULL, caseSensitive);
case no_attr:
case no_attr_expr:
case no_attr_link:
return NULL;
case no_implicitcast:
//can sometimes occur when parameters are bound
return createStructure(expr->queryChild(0), caseSensitive);
case no_pat_const:
if (createLexer)
return expandStringAsChars(expr, info, caseSensitive);
return createRegexExpr(info, expr, caseSensitive);
case no_pat_set:
case no_penalty:
//Don't walk children...
return createRegexExpr(info, expr, caseSensitive);
case no_pat_x_before_y:
case no_pat_x_after_y:
{
Owned pattern = createStructure(expr->queryChild(0), caseSensitive);
Owned check = createRegexExpr(info, expr, caseSensitive);
check->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
if (op == no_pat_x_before_y)
return ::createFollow(info, pattern.getClear(), check.getClear(), NULL, caseSensitive);
else
return ::createFollow(info, check.getClear(), pattern.getClear(), NULL, caseSensitive);
}
case no_pat_before_y:
case no_pat_after_y:
{
Owned check = createRegexExpr(info, expr, caseSensitive);
check->setSubPattern(createNamed(expr->queryChild(0), NULL, no_pat_checkin, caseSensitive));
return check.getClear();
}
case no_pat_checkin:
{
Owned start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
Owned next = createStructure(expr->queryChild(0), caseSensitive);
Owned finish = createRegexExpr(info, no_pat_endcheckin, expr, caseSensitive);
finish->setSubPattern(createNamed(expr->queryChild(1), NULL, no_pat_checkin, caseSensitive));
return ::createFollow(info, start.getClear(), next.getClear(), finish.getClear(), caseSensitive);
}
case no_pat_checklength:
{
HqlRegexExpr * start = createRegexExpr(info, no_pat_begincheck, expr, caseSensitive);
return createFollow(start, expr->queryChild(0), no_pat_endchecklength, expr, caseSensitive);
}
case no_pat_validate:
{
HqlRegexExpr * start = createRegexExpr(info, no_pat_beginvalidate, expr, caseSensitive);
return createFollow(start, expr->queryChild(0), no_pat_endvalidate, expr, caseSensitive);
}
case no_pat_token:
case no_pat_imptoken:
{
IHqlExpression * child = expr->queryChild(0);
//MORE: Covert into a function to allow multiple values of following:
switch (child->getOperator())
{
case no_null:
case no_pat_first:
case no_pat_last:
return createStructure(child, caseSensitive);
case no_penalty:
case no_pat_x_before_y:
case no_pat_before_y:
case no_pat_x_after_y:
case no_pat_after_y:
case no_pat_guard:
return createStructure(child, caseSensitive);
case no_pat_const:
// if (!info.separator)
// return createStructure(child);
break;
}
HqlRegexExpr * start = createRegexExpr(info, no_pat_begintoken, expr, caseSensitive);
return createFollow(start, expr->queryChild(0), no_pat_endtoken, expr, caseSensitive);
}
case no_pat_repeat:
{
HqlRegexExpr * ret = createRegexExpr(info, expr, caseSensitive);
IHqlExpression * repeated = expr->queryChild(0);
if (ret->isStandardRepeat())
ret->addOperand(createStructure(repeated, caseSensitive));
else
ret->setNamed(createNamed(repeated, NULL, op, caseSensitive));
return ret;
}
case no_pat_or:
case no_pat_follow:
{
//Expand these out as much as possible....
HqlExprArray args;
expr->unwindList(args, op);
OwnedHqlExpr flatExpr = expr->clone(args);
Owned ret = createRegexExpr(info, flatExpr, caseSensitive);
ForEachChild(idx, flatExpr)
{
HqlRegexExpr * child = createStructure(flatExpr->queryChild(idx), caseSensitive);
if (child)
ret->addOperand(child);
}
return ret.getClear();
}
case no_pat_case:
case no_pat_nocase:
return createStructure(expr->queryChild(0), op==no_pat_case);
case no_pat_featureactual:
throwError(HQLERR_RegexFeatureNotSupport);
}
Owned ret = createRegexExpr(info, expr, caseSensitive);
ForEachChild(idx, expr)
{
HqlRegexExpr * child = createStructure(expr->queryChild(idx), caseSensitive);
if (child)
ret->addOperand(child);
}
return ret.getClear();
}
void RegexContext::buildStructure()
{
unsigned startTime = msTick();
IHqlExpression * grammar = expr->queryChild(2);
assertex(grammar->getOperator() == no_pat_instance);
_ATOM name = grammar->queryChild(1)->queryName();
OwnedHqlExpr structure = LINK(grammar);//createValue(no_pat_instance, makeRuleType(NULL), LINK(grammar), LINK(grammar->queryChild(1)));
HqlRegexExpr * rootRegex = createStructure(structure, isCaseSensitive());
root.setown(new HqlNamedRegex(structure, internalAtom, structure, no_parse, isCaseSensitive(), false));
root->setRegexOwn(rootRegex);
named.append(*LINK(root));
DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Create Structure", msTick()-startTime);
}
void RegexContext::expandRecursion()
{
//First add all the symbols referenced by the use() attributes on the parse so they can be matched.
ForEachChild(idx, expr)
{
IHqlExpression * cur = expr->queryChild(idx);
if (cur->getOperator() == no_pat_use)
{
//NB: Does not return a linked item.
HqlNamedRegex * named = createNamed(cur->queryChild(0), NULL, no_pat_instance, true);
named->removeUse(); // Not actually used at the moment...
HqlNamedRegex * named2 = createNamed(cur->queryChild(0), NULL, no_pat_instance, false);
named2->removeUse(); // Not actually used at the moment...
}
}
root->expandRecursion(*this, NULL);
ForEachItemInRev(idx2, named)
if (!named.item(idx2).queryExpandedRecursion())
named.remove(idx2);
}
void RegexContext::insertSeparators()
{
if (info.separator)
{
ForEachItemIn(idx, named)
named.item(idx).insertSeparators(info.separator, this);
//add separator/first onto the front of root.
info.addedSeparators = true;
}
}
void RegexContext::optimizeSpotDFA()
{
if (info.dfaComplexity > 0)
{
ForEachItemIn(idx, named)
named.item(idx).calcDfaScore();
root->markDFAs(info.dfaComplexity);
ForEachItemIn(idx2, named)
named.item(idx2).createDFAs(*this);
}
}
void RegexContext::optimizePattern()
{
unsigned startTime = msTick();
ForEachItemIn(idx1, named)
named.item(idx1).mergeCreateSets();
root->expandNamedSymbols();
ForEachItemInRev(idx2, named)
{
if (!named.item(idx2).isUsed())
{
//Can't delete it otherwise everything gets cleaned up...
deadNamed.append(named.item(idx2));
named.remove(idx2, true);
}
}
optimizeSpotDFA();
DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Optimize", msTick()-startTime);
}
HqlNamedRegex * RegexContext::queryDefine(IHqlExpression * defineName, bool caseSensitive)
{
ForEachItemIn(idx, named)
{
HqlNamedRegex & cur = named.item(idx);
if (cur.matchesDefine(defineName,caseSensitive))
return &cur;
}
StringBuffer s;
defineName->toString(s);
throwError1(HQLERR_DefineUseStrNotFound, s.str());
return NULL;
}
void RegexContext::analysePattern()
{
unsigned startTime = msTick();
//This conversion is based around the description in the Dragon book:
//3.9 From a regular expression to a DFA
//even though we don't always convert it, the steps form a useful algorithm
ForEachItemIn(idx0, named)
named.item(idx0).addBeginEnd(info);
ForEachItemIn(idx1, named)
named.item(idx1).analyseNullLeadTrail();
ForEachItemIn(idx2, named)
named.item(idx2).calculateNext();
ForEachItemIn(idx3, named)
named.item(idx3).generateDFAs();
DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Analyse", msTick()-startTime);
}
void RegexContext::generateRegex()
{
unsigned startTime = msTick();
parser.addedSeparators = info.addedSeparators;
setParserOptions(parser);
GenerateRegexCtx ctx(*this, info, idAllocator);
ForEachItemIn(idx0, named)
{
HqlNamedRegex & cur = named.item(idx0);
cur.generateRegex(ctx);
}
parser.grammar.set(root->queryRootPattern());
parser.minPatternLength = root->getMinLength();
DEBUG_TIMERX(timeReporter, "EclServer: Generate PARSE: Generate", msTick()-startTime);
}
//void RegexContext::removeTrivialInstances()
//{
// ForEachItemIn(idx, named)
// named.item(idx).removeTrivialInstances();
//}
void RegexContext::compileSearchPattern()
{
checkValidMatches();
buildStructure();
expandRecursion();
// removeTrivialInstances();
insertSeparators();
optimizePattern();
analysePattern();
generateRegex();
compileMatched(parser);
}
void RegexContext::getDebugText(StringBuffer & s, unsigned detail)
{
NlpParseContext::getDebugText(s, detail);
regexToXml(s, parser.grammar, detail);
}
NlpParseContext * createRegexContext(IHqlExpression * expr, IWorkUnit * wu, const HqlCppOptions & options, ITimeReporter * timeReporter, byte algorithm)
{
return new RegexContext(expr, wu, options, timeReporter, algorithm);
}
//-- Lexer creation
void RegexContext::beginLexer()
{
createLexer = true;
info.expandRepeatAnyAsDfa = true;
OwnedHqlExpr searchExpr = createValue(no_pat_dfa, makePatternType(), createUniqueId());
lexerNamed.setown(new HqlNamedRegex(searchExpr, NULL, searchExpr, no_pat_dfa, isCaseSensitive(), false));
lexerOr.setown(createRegexExpr(info, no_pat_or, NULL, isCaseSensitive()));
lexerNamed->setRegexOwn(LINK(lexerOr));
lexerRoot.setown(createRegexExpr(info, no_pat_dfa, NULL, isCaseSensitive()));
lexerRoot->setNamed(lexerNamed);
root.setown(new HqlNamedRegex(expr, NULL, expr, no_parse, isCaseSensitive(), false));
root->setRegexOwn(LINK(lexerRoot));
named.append(*LINK(lexerNamed));
named.append(*LINK(root));
}
void RegexContext::addLexerToken(unsigned id, IHqlExpression * pattern)
{
HqlRegexExpr * token = createStructure(pattern, isCaseSensitive());
HqlRegexExpr * end = createRegexExpr(info, no_pat_endpattern, NULL, isCaseSensitive());
end->setAcceptId(id);
if (token->getOperator() == no_pat_follow)
token->addOperand(end);
else
{
HqlRegexExpr * follow = createRegexExpr(info, no_pat_follow, NULL, isCaseSensitive());
follow->addOperand(token);
follow->addOperand(end);
token = follow;
}
lexerOr->addOperand(token);
}
void RegexContext::generateLexer(IDfaPattern * builder)
{
//MORE: Need to call some elements of optimizePattern() to expand limited repeats.
analysePattern();
lexerRoot->generateDFA(builder);
}
/*
ToDo:
o Should move some of the logic from HqlregexExpr to complex + remove the queyNamed() etc. virtuals.
*/